]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/nspawn/nspawn.c
util: split-out hwclock.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
CommitLineData
88213476
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
88213476
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
88213476 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
88213476
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <signal.h>
23#include <sched.h>
24#include <unistd.h>
25#include <sys/types.h>
26#include <sys/syscall.h>
27#include <sys/mount.h>
28#include <sys/wait.h>
29#include <stdlib.h>
30#include <string.h>
31#include <stdio.h>
32#include <errno.h>
33#include <sys/prctl.h>
34#include <sys/capability.h>
35#include <getopt.h>
a258bf26
LP
36#include <sys/epoll.h>
37#include <termios.h>
38#include <sys/signalfd.h>
687d0825 39#include <grp.h>
5ed27dbd 40#include <linux/fs.h>
9537eab0
LP
41#include <sys/un.h>
42#include <sys/socket.h>
88213476 43
81527be1
LP
44#include <systemd/sd-daemon.h>
45
88213476
LP
46#include "log.h"
47#include "util.h"
49e942b2 48#include "mkdir.h"
d7832d2c 49#include "audit.h"
94d82985 50#include "missing.h"
04d391da 51#include "cgroup-util.h"
a258bf26 52#include "strv.h"
a41fe3a2 53#include "loopback-setup.h"
88213476
LP
54
/* Command line state; every variable below is filled in by parse_argv(). */
static char *arg_directory = NULL;       /* -D / --directory: container root */
static char *arg_user = NULL;            /* -u / --user: user to run the payload as */
static char **arg_controllers = NULL;    /* -C / --controllers: extra cgroup hierarchies */
static char *arg_uuid = NULL;            /* --uuid: points into argv, not owned */
static bool arg_private_network = false; /* --private-network */
static bool arg_read_only = false;       /* --read-only */
static bool arg_boot = false;            /* -b / --boot */
88213476
LP
62
63static int help(void) {
64
65 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
66 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
40c32a4a
LGL
67 " -h --help Show this help\n"
68 " -D --directory=NAME Root directory for the container\n"
0f0dbc46 69 " -b --boot Boot up full system (i.e. invoke init)\n"
40c32a4a
LGL
70 " -u --user=USER Run the command under specified user or uid\n"
71 " -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
144f0fc0 72 " --uuid=UUID Set a specific machine UUID for the container\n"
bc2f673e
LP
73 " --private-network Disable network in container\n"
74 " --read-only Mount the root directory read-only\n",
88213476
LP
75 program_invocation_short_name);
76
77 return 0;
78}
79
80static int parse_argv(int argc, char *argv[]) {
81
a41fe3a2 82 enum {
144f0fc0 83 ARG_PRIVATE_NETWORK = 0x100,
bc2f673e
LP
84 ARG_UUID,
85 ARG_READ_ONLY
a41fe3a2
LP
86 };
87
88213476 88 static const struct option options[] = {
ff01d048
LP
89 { "help", no_argument, NULL, 'h' },
90 { "directory", required_argument, NULL, 'D' },
91 { "user", required_argument, NULL, 'u' },
40c32a4a 92 { "controllers", required_argument, NULL, 'C' },
ff01d048 93 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
0f0dbc46 94 { "boot", no_argument, NULL, 'b' },
144f0fc0 95 { "uuid", required_argument, NULL, ARG_UUID },
bc2f673e 96 { "read-only", no_argument, NULL, ARG_READ_ONLY },
ff01d048 97 { NULL, 0, NULL, 0 }
88213476
LP
98 };
99
100 int c;
101
102 assert(argc >= 0);
103 assert(argv);
104
0f0dbc46 105 while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
88213476
LP
106
107 switch (c) {
108
109 case 'h':
110 help();
111 return 0;
112
113 case 'D':
114 free(arg_directory);
3a74cea5
LP
115 arg_directory = canonicalize_file_name(optarg);
116 if (!arg_directory) {
117 log_error("Failed to canonicalize root directory.");
88213476
LP
118 return -ENOMEM;
119 }
120
121 break;
122
687d0825
MV
123 case 'u':
124 free(arg_user);
125 if (!(arg_user = strdup(optarg))) {
126 log_error("Failed to duplicate user name.");
127 return -ENOMEM;
128 }
129
130 break;
131
40c32a4a
LGL
132 case 'C':
133 strv_free(arg_controllers);
134 arg_controllers = strv_split(optarg, ",");
135 if (!arg_controllers) {
136 log_error("Failed to split controllers list.");
137 return -ENOMEM;
138 }
139 strv_uniq(arg_controllers);
140
141 break;
142
ff01d048
LP
143 case ARG_PRIVATE_NETWORK:
144 arg_private_network = true;
a41fe3a2
LP
145 break;
146
0f0dbc46
LP
147 case 'b':
148 arg_boot = true;
149 break;
150
144f0fc0
LP
151 case ARG_UUID:
152 arg_uuid = optarg;
153 break;
154
bc2f673e
LP
155 case ARG_READ_ONLY:
156 arg_read_only = true;
157 break;
158
88213476
LP
159 case '?':
160 return -EINVAL;
161
162 default:
163 log_error("Unknown option code %c", c);
164 return -EINVAL;
165 }
166 }
167
168 return 1;
169}
170
/* Mounts the API file systems listed in the table below underneath the
 * container root 'dest'.
 *
 * All entries are attempted even if an earlier one fails; the first
 * error encountered is what gets returned (0 if there was none). Mount
 * failures of entries not marked 'fatal' are ignored. Running out of
 * memory aborts the loop. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",            "/proc",           "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,          true  },
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND,                               true  }, /* Bind mount first */
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,          true  }, /* Then, make it r/o */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND,                               true  }, /* Bind mount first */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,          true  }, /* Then, make it r/o */
                { "tmpfs",           "/dev",            "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,              true  },
                { "/dev/pts",        "/dev/pts",        "bind",  NULL,       MS_BIND,                               true  },
                { "tmpfs",           "/run",            "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,     true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND,                               false }, /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,          false }, /* Then, make it r/o */
#endif
        };

        const MountPoint *m;
        int result = 0;

        for (m = mount_table; m < mount_table + ELEMENTSOF(mount_table); m++) {
                char *target;
                int q;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        break;
                }

                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));
                        free(target);

                        if (result == 0)
                                result = q;

                        continue;
                }

                /* Best effort, the mount() below reports any real problem */
                mkdir_p(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0 && m->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (result == 0)
                                result = -errno;
                }

                free(target);
        }

        return result;
}
f8440af5 244
e58a1277
LP
/* Makes the host's timezone configuration visible inside the container
 * by bind mounting /etc/localtime and /etc/timezone read-only onto
 * their counterparts below 'dest'.
 *
 * Mount failures are ignored ("if possible"); only running out of
 * memory is reported, as -ENOMEM. Returns 0 otherwise. */
static int setup_timezone(const char *dest) {
        static const char *const files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        unsigned i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
                char *target;

                if (asprintf(&target, "%s%s", dest, files[i]) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* Bind mount first, then make it r/o */
                if (mount(files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
273
2547bb41
LP
274static int setup_resolv_conf(const char *dest) {
275 char *where;
276
277 assert(dest);
278
279 if (arg_private_network)
280 return 0;
281
282 /* Fix resolv.conf, if possible */
283 if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
284 log_error("Out of memory");
285 return -ENOMEM;
286 }
287
288 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
289 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
290
291 free(where);
292
293 return 0;
294}
295
e58a1277 296static int copy_devnodes(const char *dest) {
88213476
LP
297
298 static const char devnodes[] =
299 "null\0"
300 "zero\0"
301 "full\0"
302 "random\0"
303 "urandom\0"
304 "tty\0"
305 "ptmx\0"
88213476
LP
306 "rtc0\0";
307
308 const char *d;
e58a1277 309 int r = 0;
124640f1 310 mode_t u;
a258bf26
LP
311
312 assert(dest);
124640f1
LP
313
314 u = umask(0000);
88213476
LP
315
316 NULSTR_FOREACH(d, devnodes) {
e58a1277
LP
317 struct stat st;
318 char *from = NULL, *to = NULL;
88213476
LP
319
320 asprintf(&from, "/dev/%s", d);
321 asprintf(&to, "%s/dev/%s", dest, d);
322
323 if (!from || !to) {
324 log_error("Failed to allocate devnode path");
325
326 free(from);
327 free(to);
328
a258bf26
LP
329 from = to = NULL;
330
88213476
LP
331 if (r == 0)
332 r = -ENOMEM;
333
334 break;
335 }
336
337 if (stat(from, &st) < 0) {
338
339 if (errno != ENOENT) {
340 log_error("Failed to stat %s: %m", from);
88213476
LP
341 if (r == 0)
342 r = -errno;
343 }
344
a258bf26 345 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
88213476 346
a258bf26
LP
347 log_error("%s is not a char or block device, cannot copy.", from);
348 if (r == 0)
349 r = -EIO;
350
351 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
352
353 log_error("mknod(%s) failed: %m", dest);
354 if (r == 0)
355 r = -errno;
88213476
LP
356 }
357
358 free(from);
359 free(to);
360 }
361
e58a1277 362 umask(u);
88213476 363
e58a1277
LP
364 return r;
365}
88213476 366
e58a1277
LP
/* Provides <dest>/dev/console, backed by the TTY 'console'.
 *
 * The TTY must be a character device; its access mode is reset to 0600
 * with owner and group 0. A placeholder device node is then created in
 * the container and the TTY is bind mounted on top of it.
 *
 * Returns a negative errno-style code on failure. The umask is cleared
 * for the duration of the call and restored afterwards. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
426
427static int setup_kmsg(const char *dest, int kmsg_socket) {
428 char *from = NULL, *to = NULL;
429 int r, fd, k;
430 mode_t u;
431 union {
432 struct cmsghdr cmsghdr;
433 uint8_t buf[CMSG_SPACE(sizeof(int))];
434 } control;
435 struct msghdr mh;
436 struct cmsghdr *cmsg;
437
438 assert(dest);
439 assert(kmsg_socket >= 0);
a258bf26 440
e58a1277 441 u = umask(0000);
a258bf26 442
f1e5dfe2
LP
443 /* We create the kmsg FIFO as /dev/kmsg, but immediately
444 * delete it after bind mounting it to /proc/kmsg. While FIFOs
445 * on the reading side behave very similar to /proc/kmsg,
446 * their writing side behaves differently from /dev/kmsg in
447 * that writing blocks when nothing is reading. In order to
448 * avoid any problems with containers deadlocking due to this
449 * we simply make /dev/kmsg unavailable to the container. */
e58a1277
LP
450 if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
451 log_error("Out of memory");
452 r = -ENOMEM;
453 goto finish;
454 }
455
456 if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
457 log_error("Out of memory");
458 r = -ENOMEM;
459 goto finish;
460 }
461
462 if (mkfifo(from, 0600) < 0) {
463 log_error("mkfifo() for /dev/kmsg failed: %m");
464 r = -errno;
465 goto finish;
466 }
467
468 r = chmod_and_chown(from, 0600, 0, 0);
469 if (r < 0) {
470 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
471 goto finish;
472 }
473
474 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
475 log_error("Bind mount for /proc/kmsg failed: %m");
476 r = -errno;
477 goto finish;
478 }
479
480 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
481 if (fd < 0) {
482 log_error("Failed to open fifo: %m");
483 r = -errno;
484 goto finish;
485 }
486
487 zero(mh);
488 zero(control);
489
490 mh.msg_control = &control;
491 mh.msg_controllen = sizeof(control);
492
493 cmsg = CMSG_FIRSTHDR(&mh);
494 cmsg->cmsg_level = SOL_SOCKET;
495 cmsg->cmsg_type = SCM_RIGHTS;
496 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
497 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
498
499 mh.msg_controllen = cmsg->cmsg_len;
500
501 /* Store away the fd in the socket, so that it stays open as
502 * long as we run the child */
503 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
504 close_nointr_nofail(fd);
505
506 if (k < 0) {
507 log_error("Failed to send FIFO fd: %m");
508 r = -errno;
509 goto finish;
a258bf26
LP
510 }
511
f1e5dfe2
LP
512 /* And now make the FIFO unavailable as /dev/kmsg... */
513 unlink(from);
514
a258bf26 515finish:
e58a1277
LP
516 free(from);
517 free(to);
124640f1
LP
518 umask(u);
519
88213476
LP
520 return r;
521}
522
3a74cea5
LP
523static int setup_hostname(void) {
524 char *hn;
525 int r = 0;
526
527 hn = file_name_from_path(arg_directory);
528 if (hn) {
529 hn = strdup(hn);
530 if (!hn)
531 return -ENOMEM;
532
533 hostname_cleanup(hn);
534
535 if (!isempty(hn))
536 if (sethostname(hn, strlen(hn)) < 0)
537 r = -errno;
538
539 free(hn);
540 }
541
542 return r;
543}
544
88213476
LP
545static int drop_capabilities(void) {
546 static const unsigned long retain[] = {
547 CAP_CHOWN,
548 CAP_DAC_OVERRIDE,
549 CAP_DAC_READ_SEARCH,
550 CAP_FOWNER,
551 CAP_FSETID,
552 CAP_IPC_OWNER,
553 CAP_KILL,
554 CAP_LEASE,
555 CAP_LINUX_IMMUTABLE,
556 CAP_NET_BIND_SERVICE,
557 CAP_NET_BROADCAST,
558 CAP_NET_RAW,
559 CAP_SETGID,
560 CAP_SETFCAP,
561 CAP_SETPCAP,
562 CAP_SETUID,
563 CAP_SYS_ADMIN,
564 CAP_SYS_CHROOT,
565 CAP_SYS_NICE,
566 CAP_SYS_PTRACE,
567 CAP_SYS_TTY_CONFIG
568 };
569
570 unsigned long l;
571
64685e0c 572 for (l = 0; l <= cap_last_cap(); l++) {
88213476
LP
573 unsigned i;
574
575 for (i = 0; i < ELEMENTSOF(retain); i++)
576 if (retain[i] == l)
577 break;
578
579 if (i < ELEMENTSOF(retain))
580 continue;
581
582 if (prctl(PR_CAPBSET_DROP, l) < 0) {
88213476
LP
583 log_error("PR_CAPBSET_DROP failed: %m");
584 return -errno;
585 }
586 }
587
588 return 0;
589}
590
/* Checks whether 'path' looks like the root directory of an OS
 * installation.
 *
 * Returns 1 if <path>/bin/sh exists, 0 if it does not (or cannot be
 * accessed), and -ENOMEM if the path to probe cannot be allocated. */
static int is_os_tree(const char *path) {
        /* We use /bin/sh as flag file if something is an OS */
        static const char flag_file[] = "/bin/sh";
        size_t size;
        char *probe;
        int found;

        /* sizeof() already accounts for the trailing NUL */
        size = strlen(path) + sizeof(flag_file);

        probe = malloc(size);
        if (!probe)
                return -ENOMEM;

        snprintf(probe, size, "%s%s", path, flag_file);

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found;
}
604
a258bf26 605static int process_pty(int master, sigset_t *mask) {
0c749d50 606
b72491a2 607 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
a258bf26
LP
608 size_t in_buffer_full = 0, out_buffer_full = 0;
609 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
610 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
a258bf26
LP
611 int ep = -1, signal_fd = -1, r;
612
613 fd_nonblock(STDIN_FILENO, 1);
614 fd_nonblock(STDOUT_FILENO, 1);
615 fd_nonblock(master, 1);
616
617 if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
618 log_error("signalfd(): %m");
619 r = -errno;
620 goto finish;
621 }
622
623 if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
624 log_error("Failed to create epoll: %m");
625 r = -errno;
626 goto finish;
627 }
628
629 zero(stdin_ev);
630 stdin_ev.events = EPOLLIN|EPOLLET;
631 stdin_ev.data.fd = STDIN_FILENO;
632
633 zero(stdout_ev);
634 stdout_ev.events = EPOLLOUT|EPOLLET;
635 stdout_ev.data.fd = STDOUT_FILENO;
636
637 zero(master_ev);
638 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
639 master_ev.data.fd = master;
640
641 zero(signal_ev);
642 signal_ev.events = EPOLLIN;
643 signal_ev.data.fd = signal_fd;
644
645 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
646 epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
647 epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
648 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
649 log_error("Failed to regiser fds in epoll: %m");
650 r = -errno;
651 goto finish;
652 }
653
fd14078a 654 for (;;) {
a258bf26
LP
655 struct epoll_event ev[16];
656 ssize_t k;
657 int i, nfds;
658
659 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
660
661 if (errno == EINTR || errno == EAGAIN)
662 continue;
663
664 log_error("epoll_wait(): %m");
665 r = -errno;
666 goto finish;
667 }
668
669 assert(nfds >= 1);
670
671 for (i = 0; i < nfds; i++) {
672 if (ev[i].data.fd == STDIN_FILENO) {
673
fd14078a 674 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
675 stdin_readable = true;
676
677 } else if (ev[i].data.fd == STDOUT_FILENO) {
678
fd14078a 679 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
680 stdout_writable = true;
681
682 } else if (ev[i].data.fd == master) {
683
fd14078a 684 if (ev[i].events & (EPOLLIN|EPOLLHUP))
a258bf26
LP
685 master_readable = true;
686
fd14078a 687 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
a258bf26
LP
688 master_writable = true;
689
690 } else if (ev[i].data.fd == signal_fd) {
691 struct signalfd_siginfo sfsi;
692 ssize_t n;
693
694 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
695
696 if (n >= 0) {
0c749d50 697 log_error("Failed to read from signalfd: invalid block size");
a258bf26
LP
698 r = -EIO;
699 goto finish;
700 }
701
702 if (errno != EINTR && errno != EAGAIN) {
0c749d50 703 log_error("Failed to read from signalfd: %m");
a258bf26
LP
704 r = -errno;
705 goto finish;
706 }
707 } else {
708
709 if (sfsi.ssi_signo == SIGWINCH) {
710 struct winsize ws;
711
712 /* The window size changed, let's forward that. */
a258bf26
LP
713 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
714 ioctl(master, TIOCSWINSZ, &ws);
715 } else {
0c749d50 716 r = 0;
a258bf26
LP
717 goto finish;
718 }
719 }
720 }
721 }
722
723 while ((stdin_readable && in_buffer_full <= 0) ||
724 (master_writable && in_buffer_full > 0) ||
725 (master_readable && out_buffer_full <= 0) ||
726 (stdout_writable && out_buffer_full > 0)) {
727
b72491a2 728 if (stdin_readable && in_buffer_full < LINE_MAX) {
a258bf26 729
b72491a2 730 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
a258bf26 731
fd14078a 732 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 733 stdin_readable = false;
a258bf26
LP
734 else {
735 log_error("read(): %m");
0c749d50 736 r = -errno;
a258bf26
LP
737 goto finish;
738 }
739 } else
740 in_buffer_full += (size_t) k;
a258bf26
LP
741 }
742
743 if (master_writable && in_buffer_full > 0) {
744
745 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
746
fd14078a 747 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 748 master_writable = false;
fd14078a 749 else {
a258bf26 750 log_error("write(): %m");
0c749d50 751 r = -errno;
a258bf26
LP
752 goto finish;
753 }
754
755 } else {
756 assert(in_buffer_full >= (size_t) k);
757 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
758 in_buffer_full -= k;
759 }
760 }
761
b72491a2 762 if (master_readable && out_buffer_full < LINE_MAX) {
a258bf26 763
b72491a2 764 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
a258bf26 765
fd14078a 766 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 767 master_readable = false;
a258bf26
LP
768 else {
769 log_error("read(): %m");
0c749d50 770 r = -errno;
a258bf26
LP
771 goto finish;
772 }
773 } else
774 out_buffer_full += (size_t) k;
a258bf26
LP
775 }
776
777 if (stdout_writable && out_buffer_full > 0) {
778
779 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
780
fd14078a 781 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
a258bf26 782 stdout_writable = false;
fd14078a 783 else {
a258bf26 784 log_error("write(): %m");
0c749d50 785 r = -errno;
a258bf26
LP
786 goto finish;
787 }
788
789 } else {
790 assert(out_buffer_full >= (size_t) k);
791 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
792 out_buffer_full -= k;
793 }
794 }
795 }
fd14078a 796 }
a258bf26
LP
797
798finish:
799 if (ep >= 0)
800 close_nointr_nofail(ep);
801
802 if (signal_fd >= 0)
803 close_nointr_nofail(signal_fd);
804
805 return r;
806}
88213476
LP
807
808int main(int argc, char *argv[]) {
809 pid_t pid = 0;
04d391da
LP
810 int r = EXIT_FAILURE, k;
811 char *oldcg = NULL, *newcg = NULL;
40c32a4a 812 char **controller = NULL;
a258bf26
LP
813 int master = -1;
814 const char *console = NULL;
815 struct termios saved_attr, raw_attr;
816 sigset_t mask;
817 bool saved_attr_valid = false;
818 struct winsize ws;
e58a1277 819 int kmsg_socket_pair[2] = { -1, -1 };
88213476
LP
820
821 log_parse_environment();
822 log_open();
823
824 if ((r = parse_argv(argc, argv)) <= 0)
825 goto finish;
826
827 if (arg_directory) {
828 char *p;
829
830 p = path_make_absolute_cwd(arg_directory);
831 free(arg_directory);
832 arg_directory = p;
833 } else
834 arg_directory = get_current_dir_name();
835
836 if (!arg_directory) {
837 log_error("Failed to determine path");
838 goto finish;
839 }
840
841 path_kill_slashes(arg_directory);
842
843 if (geteuid() != 0) {
844 log_error("Need to be root.");
845 goto finish;
846 }
847
04d391da
LP
848 if (sd_booted() <= 0) {
849 log_error("Not running on a systemd system.");
850 goto finish;
851 }
852
88213476 853 if (path_equal(arg_directory, "/")) {
6df6b939 854 log_error("Spawning container on root directory not supported.");
88213476
LP
855 goto finish;
856 }
857
858 if (is_os_tree(arg_directory) <= 0) {
859 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
860 goto finish;
861 }
862
04d391da
LP
863 if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
864 log_error("Failed to determine current cgroup: %s", strerror(-k));
865 goto finish;
866 }
867
868 if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
869 log_error("Failed to allocate cgroup path.");
870 goto finish;
871 }
872
40c32a4a
LGL
873 k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
874 if (k < 0) {
04d391da
LP
875 log_error("Failed to create cgroup: %s", strerror(-k));
876 goto finish;
877 }
878
40c32a4a
LGL
879 STRV_FOREACH(controller,arg_controllers) {
880 k = cg_create_and_attach(*controller, newcg, 0);
881 if (k < 0)
882 log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
883 }
884
a258bf26
LP
885 if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
886 log_error("Failed to acquire pseudo tty: %m");
887 goto finish;
888 }
889
890 if (!(console = ptsname(master))) {
891 log_error("Failed to determine tty name: %m");
892 goto finish;
893 }
894
895 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
896
897 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
898 ioctl(master, TIOCSWINSZ, &ws);
899
900 if (unlockpt(master) < 0) {
901 log_error("Failed to unlock tty: %m");
902 goto finish;
903 }
904
905 if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
906 log_error("Failed to get terminal attributes: %m");
907 goto finish;
908 }
909
910 saved_attr_valid = true;
911
912 raw_attr = saved_attr;
913 cfmakeraw(&raw_attr);
914 raw_attr.c_lflag &= ~ECHO;
915
916 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
917 log_error("Failed to set terminal attributes: %m");
918 goto finish;
919 }
920
e58a1277
LP
921 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
922 log_error("Failed to create kmsg socket pair");
923 goto finish;
924 }
925
a258bf26
LP
926 assert_se(sigemptyset(&mask) == 0);
927 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
928 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
929
52af2106
LP
930 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
931 if (pid < 0) {
932 if (errno == EINVAL)
933 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
934 else
935 log_error("clone() failed: %m");
936
88213476
LP
937 goto finish;
938 }
939
940 if (pid == 0) {
a258bf26
LP
941 /* child */
942
687d0825
MV
943 const char *home = NULL;
944 uid_t uid = (uid_t) -1;
945 gid_t gid = (gid_t) -1;
da5b3bad 946 const char *envp[] = {
da5b3bad 947 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
3bb1c6b0 948 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
687d0825
MV
949 NULL, /* TERM */
950 NULL, /* HOME */
951 NULL, /* USER */
952 NULL, /* LOGNAME */
144f0fc0 953 NULL, /* container_uuid */
da5b3bad
LP
954 NULL
955 };
88213476 956
3bb1c6b0 957 envp[2] = strv_find_prefix(environ, "TERM=");
a258bf26
LP
958
959 close_nointr_nofail(master);
960
961 close_nointr(STDIN_FILENO);
962 close_nointr(STDOUT_FILENO);
963 close_nointr(STDERR_FILENO);
964
e58a1277 965 close_all_fds(&kmsg_socket_pair[1], 1);
a258bf26
LP
966
967 reset_all_signal_handlers();
968
969 assert_se(sigemptyset(&mask) == 0);
970 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
971
972 if (setsid() < 0)
973 goto child_fail;
974
975 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
976 goto child_fail;
88213476 977
f5c1b9ee
LP
978 /* Mark / as private, in case somebody marked it shared */
979 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
980 goto child_fail;
981
bc2f673e
LP
982 /* Turn directory into bind mount */
983 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
984 log_error("Failed to make bind mount.");
985 goto child_fail;
986 }
987
988 if (arg_read_only)
989 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
990 log_error("Failed to make read-only.");
991 goto child_fail;
992 }
993
88213476
LP
994 if (mount_all(arg_directory) < 0)
995 goto child_fail;
996
e58a1277
LP
997 if (copy_devnodes(arg_directory) < 0)
998 goto child_fail;
999
1000 if (setup_dev_console(arg_directory, console) < 0)
1001 goto child_fail;
1002
1003 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1004 goto child_fail;
1005
1006 close_nointr_nofail(kmsg_socket_pair[1]);
1007
1008 if (setup_timezone(arg_directory) < 0)
88213476
LP
1009 goto child_fail;
1010
2547bb41
LP
1011 if (setup_resolv_conf(arg_directory) < 0)
1012 goto child_fail;
1013
88213476
LP
1014 if (chdir(arg_directory) < 0) {
1015 log_error("chdir(%s) failed: %m", arg_directory);
1016 goto child_fail;
1017 }
a258bf26
LP
1018
1019 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1020 dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1021 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1022 goto child_fail;
1023
bc2f673e
LP
1024 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1025 log_error("mount(MS_BIND) failed: %m");
88213476
LP
1026 goto child_fail;
1027 }
1028
1029 if (chroot(".") < 0) {
1030 log_error("chroot() failed: %m");
1031 goto child_fail;
1032 }
1033
1034 if (chdir("/") < 0) {
1035 log_error("chdir() failed: %m");
1036 goto child_fail;
1037 }
1038
4c12626c 1039 umask(0022);
a258bf26 1040
a41fe3a2
LP
1041 loopback_setup();
1042
88213476
LP
1043 if (drop_capabilities() < 0)
1044 goto child_fail;
1045
687d0825
MV
1046 if (arg_user) {
1047
1048 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1049 log_error("get_user_creds() failed: %m");
1050 goto child_fail;
1051 }
1052
1053 if (mkdir_parents(home, 0775) < 0) {
1054 log_error("mkdir_parents() failed: %m");
1055 goto child_fail;
1056 }
1057
1058 if (safe_mkdir(home, 0775, uid, gid) < 0) {
1059 log_error("safe_mkdir() failed: %m");
1060 goto child_fail;
1061 }
1062
1063 if (initgroups((const char*)arg_user, gid) < 0) {
1064 log_error("initgroups() failed: %m");
1065 goto child_fail;
1066 }
1067
5c94603d 1068 if (setresgid(gid, gid, gid) < 0) {
687d0825
MV
1069 log_error("setregid() failed: %m");
1070 goto child_fail;
1071 }
1072
5c94603d 1073 if (setresuid(uid, uid, uid) < 0) {
687d0825
MV
1074 log_error("setreuid() failed: %m");
1075 goto child_fail;
1076 }
1077 }
1078
144f0fc0
LP
1079 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1080 (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1081 (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
5c94603d 1082 log_error("Out of memory");
687d0825 1083 goto child_fail;
144f0fc0
LP
1084 }
1085
1086 if (arg_uuid) {
1087 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1088 log_error("Out of memory");
1089 goto child_fail;
1090 }
687d0825
MV
1091 }
1092
3a74cea5 1093 setup_hostname();
88213476 1094
0f0dbc46
LP
1095 if (arg_boot) {
1096 char **a;
1097 size_t l;
1098
1099 /* Automatically search for the init system */
1100
1101 l = 1 + argc - optind;
1102 a = newa(char*, l + 1);
1103 memcpy(a + 1, argv + optind, l * sizeof(char*));
1104
1105 a[0] = (char*) "/usr/lib/systemd/systemd";
1106 execve(a[0], a, (char**) envp);
1107
1108 a[0] = (char*) "/lib/systemd/systemd";
1109 execve(a[0], a, (char**) envp);
1110
1111 a[0] = (char*) "/sbin/init";
1112 execve(a[0], a, (char**) envp);
1113 } else if (argc > optind)
da5b3bad
LP
1114 execvpe(argv[optind], argv + optind, (char**) envp);
1115 else {
5c94603d 1116 chdir(home ? home : "/root");
da5b3bad
LP
1117 execle("/bin/bash", "-bash", NULL, (char**) envp);
1118 }
88213476
LP
1119
1120 log_error("execv() failed: %m");
1121
1122 child_fail:
1123 _exit(EXIT_FAILURE);
1124 }
1125
a258bf26
LP
1126 if (process_pty(master, &mask) < 0)
1127 goto finish;
1128
1129 if (saved_attr_valid) {
1130 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1131 saved_attr_valid = false;
1132 }
1133
6df6b939 1134 r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
88213476
LP
1135
1136 if (r < 0)
1137 r = EXIT_FAILURE;
1138
1139finish:
a258bf26
LP
1140 if (saved_attr_valid)
1141 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1142
1143 if (master >= 0)
1144 close_nointr_nofail(master);
1145
e58a1277
LP
1146 close_pipe(kmsg_socket_pair);
1147
04d391da
LP
1148 if (oldcg)
1149 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1150
1151 if (newcg)
1152 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
88213476 1153
04d391da 1154 free(arg_directory);
40c32a4a 1155 strv_free(arg_controllers);
04d391da
LP
1156 free(oldcg);
1157 free(newcg);
88213476
LP
1158
1159 return r;
1160}