]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
util: split-out path-util.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <sys/epoll.h>
37#include <termios.h>
38#include <sys/signalfd.h>
687d0825 39#include <grp.h>
5ed27dbd 40#include <linux/fs.h>
9537eab0
LP
41#include <sys/un.h>
42#include <sys/socket.h>
88213476 43
81527be1
LP
44#include <systemd/sd-daemon.h>
45
88213476
LP
46#include "log.h"
47#include "util.h"
49e942b2 48#include "mkdir.h"
d7832d2c 49#include "audit.h"
94d82985 50#include "missing.h"
04d391da 51#include "cgroup-util.h"
a258bf26 52#include "strv.h"
9eb977db 53#include "path-util.h"
a41fe3a2 54#include "loopback-setup.h"
88213476
LP
55
/* Command line settings, filled in by parse_argv() and consumed by main(). */
static char *arg_directory = NULL;        /* container root; heap-allocated, released in main() */
static char *arg_user = NULL;             /* --user=: heap-allocated copy of the option argument */
static char **arg_controllers = NULL;     /* --controllers=: string vector, released with strv_free() */
static const char *arg_uuid = NULL;       /* --uuid=: borrowed pointer to the option argument, never freed */
static bool arg_private_network = false;  /* --private-network */
static bool arg_read_only = false;        /* --read-only */
static bool arg_boot = false;             /* --boot */
88213476
LP
63
/* Prints the usage text to stdout. Always returns 0. */
static int help(void) {

        /* Synopsis line, the only part that depends on the program name */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);

        /* Fixed description and option table */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              " -h --help Show this help\n"
              " -D --directory=NAME Root directory for the container\n"
              " -b --boot Boot up full system (i.e. invoke init)\n"
              " -u --user=USER Run the command under specified user or uid\n"
              " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              " --uuid=UUID Set a specific machine UUID for the container\n"
              " --private-network Disable network in container\n"
              " --read-only Mount the root directory read-only\n",
              stdout);

        return 0;
}
80
81static int parse_argv(int argc, char *argv[]) {
82
a41fe3a2 83 enum {
144f0fc0 84 ARG_PRIVATE_NETWORK = 0x100,
bc2f673e
LP
85 ARG_UUID,
86 ARG_READ_ONLY
a41fe3a2
LP
87 };
88
88213476 89 static const struct option options[] = {
ff01d048
LP
90 { "help", no_argument, NULL, 'h' },
91 { "directory", required_argument, NULL, 'D' },
92 { "user", required_argument, NULL, 'u' },
40c32a4a 93 { "controllers", required_argument, NULL, 'C' },
ff01d048 94 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
0f0dbc46 95 { "boot", no_argument, NULL, 'b' },
144f0fc0 96 { "uuid", required_argument, NULL, ARG_UUID },
bc2f673e 97 { "read-only", no_argument, NULL, ARG_READ_ONLY },
ff01d048 98 { NULL, 0, NULL, 0 }
88213476
LP
99 };
100
101 int c;
102
103 assert(argc >= 0);
104 assert(argv);
105
0f0dbc46 106 while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
88213476
LP
107
108 switch (c) {
109
110 case 'h':
111 help();
112 return 0;
113
114 case 'D':
115 free(arg_directory);
3a74cea5
LP
116 arg_directory = canonicalize_file_name(optarg);
117 if (!arg_directory) {
118 log_error("Failed to canonicalize root directory.");
88213476
LP
119 return -ENOMEM;
120 }
121
122 break;
123
687d0825
MV
124 case 'u':
125 free(arg_user);
126 if (!(arg_user = strdup(optarg))) {
127 log_error("Failed to duplicate user name.");
128 return -ENOMEM;
129 }
130
131 break;
132
40c32a4a
LGL
133 case 'C':
134 strv_free(arg_controllers);
135 arg_controllers = strv_split(optarg, ",");
136 if (!arg_controllers) {
137 log_error("Failed to split controllers list.");
138 return -ENOMEM;
139 }
140 strv_uniq(arg_controllers);
141
142 break;
143
ff01d048
LP
144 case ARG_PRIVATE_NETWORK:
145 arg_private_network = true;
a41fe3a2
LP
146 break;
147
0f0dbc46
LP
148 case 'b':
149 arg_boot = true;
150 break;
151
144f0fc0
LP
152 case ARG_UUID:
153 arg_uuid = optarg;
154 break;
155
bc2f673e
LP
156 case ARG_READ_ONLY:
157 arg_read_only = true;
158 break;
159
88213476
LP
160 case '?':
161 return -EINVAL;
162
163 default:
164 log_error("Unknown option code %c", c);
165 return -EINVAL;
166 }
167 }
168
169 return 1;
170}
171
/* Establishes the API file system mounts listed in the table below
 * underneath dest. Keeps going after a failed entry; returns 0 if
 * everything worked, otherwise the first error encountered as a
 * negative errno-style value. Failures of entries not marked fatal are
 * ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true },                      /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND, true },                      /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND, true },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND, false },                      /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
#endif
        };

        const MountPoint *entry;
        int result = 0;

        for (entry = mount_table; entry < mount_table + ELEMENTSOF(mount_table); entry++) {
                char *target;
                int q;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        /* No point in trying the remaining entries */
                        break;
                }

                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));
                        free(target);

                        if (result == 0)
                                result = q;

                        continue;
                }

                /* Best effort: a missing directory surfaces as a mount error below */
                mkdir_p(target, 0755);

                q = mount(entry->what, target, entry->type, entry->flags, entry->options);
                if (q < 0 && entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (result == 0)
                                result = -errno;
                }

                free(target);
        }

        return result;
}
f8440af5 245
e58a1277
LP
/* Bind mounts /etc/localtime and /etc/timezone of the calling
 * environment read-only over the same paths below dest. Mount failures
 * are ignored; only allocation failure is reported (-ENOMEM). */
static int setup_timezone(const char *dest) {
        char *target;
        int bound;

        assert(dest);

        /* First the binary zone file ... */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bound = mount("/etc/localtime", target, "bind", MS_BIND, NULL);
        if (bound >= 0)
                /* A second pass is needed to turn the bind mount read-only */
                mount("/etc/localtime", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        /* ... then the textual zone name */
        if (asprintf(&target, "%s/etc/timezone", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bound = mount("/etc/timezone", target, "bind", MS_BIND, NULL);
        if (bound >= 0)
                mount("/etc/timezone", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        return 0;
}
274
2547bb41
LP
275static int setup_resolv_conf(const char *dest) {
276 char *where;
277
278 assert(dest);
279
280 if (arg_private_network)
281 return 0;
282
283 /* Fix resolv.conf, if possible */
284 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
285 log_error("Out of memory");
286 return -ENOMEM;
287 }
288
289 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
290 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
291
292 free(where);
293
294 return 0;
295}
296
e58a1277 297static int copy_devnodes(const char *dest) {
88213476
LP
298
299 static const char devnodes[] =
300 "null\0"
301 "zero\0"
302 "full\0"
303 "random\0"
304 "urandom\0"
305 "tty\0"
306 "ptmx\0"
88213476
LP
307 "rtc0\0";
308
309 const char *d;
e58a1277 310 int r = 0;
124640f1 311 mode_t u;
a258bf26
LP
312
313 assert(dest);
124640f1
LP
314
315 u = umask(0000);
88213476
LP
316
317 NULSTR_FOREACH(d, devnodes) {
e58a1277
LP
318 struct stat st;
319 char *from = NULL, *to = NULL;
88213476
LP
320
321 asprintf(&from, "/dev/%s", d);
322 asprintf(&to, "%s/dev/%s", dest, d);
323
324 if (!from || !to) {
325 log_error("Failed to allocate devnode path");
326
327 free(from);
328 free(to);
329
a258bf26
LP
330 from = to = NULL;
331
88213476
LP
332 if (r == 0)
333 r = -ENOMEM;
334
335 break;
336 }
337
338 if (stat(from, &st) < 0) {
339
340 if (errno != ENOENT) {
341 log_error("Failed to stat %s: %m", from);
88213476
LP
342 if (r == 0)
343 r = -errno;
344 }
345
a258bf26 346 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 347
a258bf26
LP
348 log_error("%s is not a char or block device, cannot copy.", from);
349 if (r == 0)
350 r = -EIO;
351
352 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
353
354 log_error("mknod(%s) failed: %m", dest);
355 if (r == 0)
356 r = -errno;
88213476
LP
357 }
358
359 free(from);
360 free(to);
361 }
362
e58a1277 363 umask(u);
88213476 364
e58a1277
LP
365 return r;
366}
88213476 367
e58a1277
LP
/* Makes the terminal named by console available as dest/dev/console:
 * fixes up its access mode, creates a placeholder device node and bind
 * mounts the terminal over it. Returns a negative errno-style value on
 * failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* A pty can only live on a pts file system, so the real
         * terminal has to be bind mounted onto /dev/console. The bind
         * mount needs something to sit on, hence a device node is
         * created first. Its major/minor is copied from the terminal
         * but is irrelevant in practice, as the node gets covered by
         * the mount right away. */

        if (mknod(node, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
427
428static int setup_kmsg(const char *dest, int kmsg_socket) {
429 char *from = NULL, *to = NULL;
430 int r, fd, k;
431 mode_t u;
432 union {
433 struct cmsghdr cmsghdr;
434 uint8_t buf[CMSG_SPACE(sizeof(int))];
435 } control;
436 struct msghdr mh;
437 struct cmsghdr *cmsg;
438
439 assert(dest);
440 assert(kmsg_socket >= 0);
a258bf26 441
e58a1277 442 u = umask(0000);
a258bf26 443
f1e5dfe2
LP
444 /* We create the kmsg FIFO as /dev/kmsg, but immediately
445 * delete it after bind mounting it to /proc/kmsg. While FIFOs
446 * on the reading side behave very similar to /proc/kmsg,
447 * their writing side behaves differently from /dev/kmsg in
448 * that writing blocks when nothing is reading. In order to
449 * avoid any problems with containers deadlocking due to this
450 * we simply make /dev/kmsg unavailable to the container. */
e58a1277
LP
451 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
452 log_error("Out of memory");
453 r = -ENOMEM;
454 goto finish;
455 }
456
457 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
458 log_error("Out of memory");
459 r = -ENOMEM;
460 goto finish;
461 }
462
463 if (mkfifo(from, 0600) < 0) {
464 log_error("mkfifo() for /dev/kmsg failed: %m");
465 r = -errno;
466 goto finish;
467 }
468
469 r = chmod_and_chown(from, 0600, 0, 0);
470 if (r < 0) {
471 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
472 goto finish;
473 }
474
475 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
476 log_error("Bind mount for /proc/kmsg failed: %m");
477 r = -errno;
478 goto finish;
479 }
480
481 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
482 if (fd < 0) {
483 log_error("Failed to open fifo: %m");
484 r = -errno;
485 goto finish;
486 }
487
488 zero(mh);
489 zero(control);
490
491 mh.msg_control = &control;
492 mh.msg_controllen = sizeof(control);
493
494 cmsg = CMSG_FIRSTHDR(&mh);
495 cmsg->cmsg_level = SOL_SOCKET;
496 cmsg->cmsg_type = SCM_RIGHTS;
497 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
498 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
499
500 mh.msg_controllen = cmsg->cmsg_len;
501
502 /* Store away the fd in the socket, so that it stays open as
503 * long as we run the child */
504 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
505 close_nointr_nofail(fd);
506
507 if (k < 0) {
508 log_error("Failed to send FIFO fd: %m");
509 r = -errno;
510 goto finish;
a258bf26
LP
511 }
512
f1e5dfe2
LP
513 /* And now make the FIFO unavailable as /dev/kmsg... */
514 unlink(from);
515
a258bf26 516finish:
e58a1277
LP
517 free(from);
518 free(to);
124640f1
LP
519 umask(u);
520
88213476
LP
521 return r;
522}
523
3a74cea5
LP
524static int setup_hostname(void) {
525 char *hn;
526 int r = 0;
527
9eb977db 528 hn = path_get_file_name(arg_directory);
3a74cea5
LP
529 if (hn) {
530 hn = strdup(hn);
531 if (!hn)
532 return -ENOMEM;
533
534 hostname_cleanup(hn);
535
536 if (!isempty(hn))
537 if (sethostname(hn, strlen(hn)) < 0)
538 r = -errno;
539
540 free(hn);
541 }
542
543 return r;
544}
545
88213476
LP
546static int drop_capabilities(void) {
547 static const unsigned long retain[] = {
548 CAP_CHOWN,
549 CAP_DAC_OVERRIDE,
550 CAP_DAC_READ_SEARCH,
551 CAP_FOWNER,
552 CAP_FSETID,
553 CAP_IPC_OWNER,
554 CAP_KILL,
555 CAP_LEASE,
556 CAP_LINUX_IMMUTABLE,
557 CAP_NET_BIND_SERVICE,
558 CAP_NET_BROADCAST,
559 CAP_NET_RAW,
560 CAP_SETGID,
561 CAP_SETFCAP,
562 CAP_SETPCAP,
563 CAP_SETUID,
564 CAP_SYS_ADMIN,
565 CAP_SYS_CHROOT,
566 CAP_SYS_NICE,
567 CAP_SYS_PTRACE,
568 CAP_SYS_TTY_CONFIG
569 };
570
571 unsigned long l;
572
64685e0c 573 for (l = 0; l <= cap_last_cap(); l++) {
88213476
LP
574 unsigned i;
575
576 for (i = 0; i < ELEMENTSOF(retain); i++)
577 if (retain[i] == l)
578 break;
579
580 if (i < ELEMENTSOF(retain))
581 continue;
582
583 if (prctl(PR_CAPBSET_DROP, l) < 0) {
88213476
LP
584 log_error("PR_CAPBSET_DROP failed: %m");
585 return -errno;
586 }
587 }
588
589 return 0;
590}
591
/* Checks whether path looks like the root of an OS installation.
 * Returns 1 if it does, 0 if it does not, -ENOMEM if the check could
 * not be carried out. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        /* The presence of /bin/sh is taken as the sign of an OS tree */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
605
a258bf26 606static int process_pty(int master, sigset_t *mask) {
0c749d50 607
b72491a2 608 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
a258bf26
LP
609 size_t in_buffer_full = 0, out_buffer_full = 0;
610 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
611 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
a258bf26
LP
612 int ep = -1, signal_fd = -1, r;
613
614 fd_nonblock(STDIN_FILENO, 1);
615 fd_nonblock(STDOUT_FILENO, 1);
616 fd_nonblock(master, 1);
617
618 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
619 log_error("signalfd(): %m");
620 r = -errno;
621 goto finish;
622 }
623
624 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
625 log_error("Failed to create epoll: %m");
626 r = -errno;
627 goto finish;
628 }
629
630 zero(stdin_ev);
631 stdin_ev.events = EPOLLIN|EPOLLET;
632 stdin_ev.data.fd = STDIN_FILENO;
633
634 zero(stdout_ev);
635 stdout_ev.events = EPOLLOUT|EPOLLET;
636 stdout_ev.data.fd = STDOUT_FILENO;
637
638 zero(master_ev);
639 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
640 master_ev.data.fd = master;
641
642 zero(signal_ev);
643 signal_ev.events = EPOLLIN;
644 signal_ev.data.fd = signal_fd;
645
646 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
647 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
648 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
649 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
650 log_error("Failed to regiser fds in epoll: %m");
651 r = -errno;
652 goto finish;
653 }
654
fd14078a 655 for (;;) {
a258bf26
LP
656 struct epoll_event ev[16];
657 ssize_t k;
658 int i, nfds;
659
660 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
661
662 if (errno == EINTR || errno == EAGAIN)
663 continue;
664
665 log_error("epoll_wait(): %m");
666 r = -errno;
667 goto finish;
668 }
669
670 assert(nfds >= 1);
671
672 for (i = 0; i < nfds; i++) {
673 if (ev[i].data.fd == STDIN_FILENO) {
674
fd14078a 675 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
676 stdin_readable = true;
677
678 } else if (ev[i].data.fd == STDOUT_FILENO) {
679
fd14078a 680 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
681 stdout_writable = true;
682
683 } else if (ev[i].data.fd == master) {
684
fd14078a 685 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
686 master_readable = true;
687
fd14078a 688 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
689 master_writable = true;
690
691 } else if (ev[i].data.fd == signal_fd) {
692 struct signalfd_siginfo sfsi;
693 ssize_t n;
694
695 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
696
697 if (n >= 0) {
0c749d50 698 log_error("Failed to read from signalfd: invalid block size");
a258bf26
LP
699 r = -EIO;
700 goto finish;
701 }
702
703 if (errno != EINTR && errno != EAGAIN) {
0c749d50 704 log_error("Failed to read from signalfd: %m");
a258bf26
LP
705 r = -errno;
706 goto finish;
707 }
708 } else {
709
710 if (sfsi.ssi_signo == SIGWINCH) {
711 struct winsize ws;
712
713 /* The window size changed, let's forward that. */
a258bf26
LP
714 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
715 ioctl(master, TIOCSWINSZ, &ws);
716 } else {
0c749d50 717 r = 0;
a258bf26
LP
718 goto finish;
719 }
720 }
721 }
722 }
723
724 while ((stdin_readable && in_buffer_full <= 0) ||
725 (master_writable && in_buffer_full > 0) ||
726 (master_readable && out_buffer_full <= 0) ||
727 (stdout_writable && out_buffer_full > 0)) {
728
b72491a2 729 if (stdin_readable && in_buffer_full < LINE_MAX) {
a258bf26 730
b72491a2 731 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
a258bf26 732
fd14078a 733 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 734 stdin_readable = false;
a258bf26
LP
735 else {
736 log_error("read(): %m");
0c749d50 737 r = -errno;
a258bf26
LP
738 goto finish;
739 }
740 } else
741 in_buffer_full += (size_t) k;
a258bf26
LP
742 }
743
744 if (master_writable && in_buffer_full > 0) {
745
746 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
747
fd14078a 748 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 749 master_writable = false;
fd14078a 750 else {
a258bf26 751 log_error("write(): %m");
0c749d50 752 r = -errno;
a258bf26
LP
753 goto finish;
754 }
755
756 } else {
757 assert(in_buffer_full >= (size_t) k);
758 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
759 in_buffer_full -= k;
760 }
761 }
762
b72491a2 763 if (master_readable && out_buffer_full < LINE_MAX) {
a258bf26 764
b72491a2 765 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
a258bf26 766
fd14078a 767 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 768 master_readable = false;
a258bf26
LP
769 else {
770 log_error("read(): %m");
0c749d50 771 r = -errno;
a258bf26
LP
772 goto finish;
773 }
774 } else
775 out_buffer_full += (size_t) k;
a258bf26
LP
776 }
777
778 if (stdout_writable && out_buffer_full > 0) {
779
780 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
781
fd14078a 782 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 783 stdout_writable = false;
fd14078a 784 else {
a258bf26 785 log_error("write(): %m");
0c749d50 786 r = -errno;
a258bf26
LP
787 goto finish;
788 }
789
790 } else {
791 assert(out_buffer_full >= (size_t) k);
792 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
793 out_buffer_full -= k;
794 }
795 }
796 }
fd14078a 797 }
a258bf26
LP
798
799finish:
800 if (ep >= 0)
801 close_nointr_nofail(ep);
802
803 if (signal_fd >= 0)
804 close_nointr_nofail(signal_fd);
805
806 return r;
807}
88213476
LP
808
809int main(int argc, char *argv[]) {
810 pid_t pid = 0;
04d391da
LP
811 int r = EXIT_FAILURE, k;
812 char *oldcg = NULL, *newcg = NULL;
40c32a4a 813 char **controller = NULL;
a258bf26
LP
814 int master = -1;
815 const char *console = NULL;
816 struct termios saved_attr, raw_attr;
817 sigset_t mask;
818 bool saved_attr_valid = false;
819 struct winsize ws;
e58a1277 820 int kmsg_socket_pair[2] = { -1, -1 };
88213476
LP
821
822 log_parse_environment();
823 log_open();
824
825 if ((r = parse_argv(argc, argv)) <= 0)
826 goto finish;
827
828 if (arg_directory) {
829 char *p;
830
831 p = path_make_absolute_cwd(arg_directory);
832 free(arg_directory);
833 arg_directory = p;
834 } else
835 arg_directory = get_current_dir_name();
836
837 if (!arg_directory) {
838 log_error("Failed to determine path");
839 goto finish;
840 }
841
842 path_kill_slashes(arg_directory);
843
844 if (geteuid() != 0) {
845 log_error("Need to be root.");
846 goto finish;
847 }
848
04d391da
LP
849 if (sd_booted() <= 0) {
850 log_error("Not running on a systemd system.");
851 goto finish;
852 }
853
88213476 854 if (path_equal(arg_directory, "/")) {
6df6b939 855 log_error("Spawning container on root directory not supported.");
88213476
LP
856 goto finish;
857 }
858
859 if (is_os_tree(arg_directory) <= 0) {
860 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
861 goto finish;
862 }
863
04d391da
LP
864 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
865 log_error("Failed to determine current cgroup: %s", strerror(-k));
866 goto finish;
867 }
868
869 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
870 log_error("Failed to allocate cgroup path.");
871 goto finish;
872 }
873
40c32a4a
LGL
874 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
875 if (k < 0) {
04d391da
LP
876 log_error("Failed to create cgroup: %s", strerror(-k));
877 goto finish;
878 }
879
40c32a4a
LGL
880 STRV_FOREACH(controller,arg_controllers) {
881 k = cg_create_and_attach(*controller, newcg, 0);
882 if (k < 0)
883 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
884 }
885
a258bf26
LP
886 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
887 log_error("Failed to acquire pseudo tty: %m");
888 goto finish;
889 }
890
891 if (!(console = ptsname(master))) {
892 log_error("Failed to determine tty name: %m");
893 goto finish;
894 }
895
896 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
897
898 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
899 ioctl(master, TIOCSWINSZ, &ws);
900
901 if (unlockpt(master) < 0) {
902 log_error("Failed to unlock tty: %m");
903 goto finish;
904 }
905
906 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
907 log_error("Failed to get terminal attributes: %m");
908 goto finish;
909 }
910
911 saved_attr_valid = true;
912
913 raw_attr = saved_attr;
914 cfmakeraw(&raw_attr);
915 raw_attr.c_lflag &= ~ECHO;
916
917 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
918 log_error("Failed to set terminal attributes: %m");
919 goto finish;
920 }
921
e58a1277
LP
922 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
923 log_error("Failed to create kmsg socket pair");
924 goto finish;
925 }
926
a258bf26
LP
927 assert_se(sigemptyset(&mask) == 0);
928 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
929 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
930
52af2106
LP
931 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
932 if (pid < 0) {
933 if (errno == EINVAL)
934 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
935 else
936 log_error("clone() failed: %m");
937
88213476
LP
938 goto finish;
939 }
940
941 if (pid == 0) {
a258bf26
LP
942 /* child */
943
687d0825
MV
944 const char *home = NULL;
945 uid_t uid = (uid_t) -1;
946 gid_t gid = (gid_t) -1;
da5b3bad 947 const char *envp[] = {
da5b3bad 948 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
3bb1c6b0 949 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
687d0825
MV
950 NULL, /* TERM */
951 NULL, /* HOME */
952 NULL, /* USER */
953 NULL, /* LOGNAME */
144f0fc0 954 NULL, /* container_uuid */
da5b3bad
LP
955 NULL
956 };
88213476 957
3bb1c6b0 958 envp[2] = strv_find_prefix(environ, "TERM=");
a258bf26
LP
959
960 close_nointr_nofail(master);
961
962 close_nointr(STDIN_FILENO);
963 close_nointr(STDOUT_FILENO);
964 close_nointr(STDERR_FILENO);
965
e58a1277 966 close_all_fds(&kmsg_socket_pair[1], 1);
a258bf26
LP
967
968 reset_all_signal_handlers();
969
970 assert_se(sigemptyset(&mask) == 0);
971 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
972
973 if (setsid() < 0)
974 goto child_fail;
975
976 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
977 goto child_fail;
88213476 978
f5c1b9ee
LP
979 /* Mark / as private, in case somebody marked it shared */
980 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
981 goto child_fail;
982
bc2f673e
LP
983 /* Turn directory into bind mount */
984 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
985 log_error("Failed to make bind mount.");
986 goto child_fail;
987 }
988
989 if (arg_read_only)
990 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
991 log_error("Failed to make read-only.");
992 goto child_fail;
993 }
994
88213476
LP
995 if (mount_all(arg_directory) < 0)
996 goto child_fail;
997
e58a1277
LP
998 if (copy_devnodes(arg_directory) < 0)
999 goto child_fail;
1000
1001 if (setup_dev_console(arg_directory, console) < 0)
1002 goto child_fail;
1003
1004 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1005 goto child_fail;
1006
1007 close_nointr_nofail(kmsg_socket_pair[1]);
1008
1009 if (setup_timezone(arg_directory) < 0)
88213476
LP
1010 goto child_fail;
1011
2547bb41
LP
1012 if (setup_resolv_conf(arg_directory) < 0)
1013 goto child_fail;
1014
88213476
LP
1015 if (chdir(arg_directory) < 0) {
1016 log_error("chdir(%s) failed: %m", arg_directory);
1017 goto child_fail;
1018 }
a258bf26
LP
1019
1020 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1021 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1022 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1023 goto child_fail;
1024
bc2f673e
LP
1025 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1026 log_error("mount(MS_BIND) failed: %m");
88213476
LP
1027 goto child_fail;
1028 }
1029
1030 if (chroot(".") < 0) {
1031 log_error("chroot() failed: %m");
1032 goto child_fail;
1033 }
1034
1035 if (chdir("/") < 0) {
1036 log_error("chdir() failed: %m");
1037 goto child_fail;
1038 }
1039
4c12626c 1040 umask(0022);
a258bf26 1041
a41fe3a2
LP
1042 loopback_setup();
1043
88213476
LP
1044 if (drop_capabilities() < 0)
1045 goto child_fail;
1046
687d0825
MV
1047 if (arg_user) {
1048
1049 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1050 log_error("get_user_creds() failed: %m");
1051 goto child_fail;
1052 }
1053
1054 if (mkdir_parents(home, 0775) < 0) {
1055 log_error("mkdir_parents() failed: %m");
1056 goto child_fail;
1057 }
1058
1059 if (safe_mkdir(home, 0775, uid, gid) < 0) {
1060 log_error("safe_mkdir() failed: %m");
1061 goto child_fail;
1062 }
1063
1064 if (initgroups((const char*)arg_user, gid) < 0) {
1065 log_error("initgroups() failed: %m");
1066 goto child_fail;
1067 }
1068
5c94603d 1069 if (setresgid(gid, gid, gid) < 0) {
687d0825
MV
1070 log_error("setregid() failed: %m");
1071 goto child_fail;
1072 }
1073
5c94603d 1074 if (setresuid(uid, uid, uid) < 0) {
687d0825
MV
1075 log_error("setreuid() failed: %m");
1076 goto child_fail;
1077 }
1078 }
1079
144f0fc0
LP
1080 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1081 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1082 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
5c94603d 1083 log_error("Out of memory");
687d0825 1084 goto child_fail;
144f0fc0
LP
1085 }
1086
1087 if (arg_uuid) {
1088 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1089 log_error("Out of memory");
1090 goto child_fail;
1091 }
687d0825
MV
1092 }
1093
3a74cea5 1094 setup_hostname();
88213476 1095
0f0dbc46
LP
1096 if (arg_boot) {
1097 char **a;
1098 size_t l;
1099
1100 /* Automatically search for the init system */
1101
1102 l = 1 + argc - optind;
1103 a = newa(char*, l + 1);
1104 memcpy(a + 1, argv + optind, l * sizeof(char*));
1105
1106 a[0] = (char*) "/usr/lib/systemd/systemd";
1107 execve(a[0], a, (char**) envp);
1108
1109 a[0] = (char*) "/lib/systemd/systemd";
1110 execve(a[0], a, (char**) envp);
1111
1112 a[0] = (char*) "/sbin/init";
1113 execve(a[0], a, (char**) envp);
1114 } else if (argc > optind)
da5b3bad
LP
1115 execvpe(argv[optind], argv + optind, (char**) envp);
1116 else {
5c94603d 1117 chdir(home ? home : "/root");
da5b3bad
LP
1118 execle("/bin/bash", "-bash", NULL, (char**) envp);
1119 }
88213476
LP
1120
1121 log_error("execv() failed: %m");
1122
1123 child_fail:
1124 _exit(EXIT_FAILURE);
1125 }
1126
a258bf26
LP
1127 if (process_pty(master, &mask) < 0)
1128 goto finish;
1129
1130 if (saved_attr_valid) {
1131 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1132 saved_attr_valid = false;
1133 }
1134
6df6b939 1135 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
88213476
LP
1136
1137 if (r < 0)
1138 r = EXIT_FAILURE;
1139
1140finish:
a258bf26
LP
1141 if (saved_attr_valid)
1142 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1143
1144 if (master >= 0)
1145 close_nointr_nofail(master);
1146
e58a1277
LP
1147 close_pipe(kmsg_socket_pair);
1148
04d391da
LP
1149 if (oldcg)
1150 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1151
1152 if (newcg)
1153 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
88213476 1154
04d391da 1155 free(arg_directory);
40c32a4a 1156 strv_free(arg_controllers);
04d391da
LP
1157 free(oldcg);
1158 free(newcg);
88213476
LP
1159
1160 return r;
1161}