]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
tree-wide: whenever we fork off a foreign child process reset signal mask/handlers
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "rtnl-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "fw-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* The ways in which a container run can end. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
116
/* Journal linking modes selectable with --link-journal= (and -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
123
/* Modes selectable with --volatile[=MODE]: a boolean, or "state". */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
129
/* Kinds of user-requested mounts (--bind=/--bind-ro=, --tmpfs=,
 * --overlay=/--overlay-ro=). The numeric order is also used as the
 * tie-breaker when sorting mounts with equal destinations. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND    = 0,
        CUSTOM_MOUNT_TMPFS   = 1,
        CUSTOM_MOUNT_OVERLAY = 2,
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 free(arg_custom_mounts);
313 arg_custom_mounts = NULL;
314 arg_n_custom_mounts = 0;
315 }
316
317 static int custom_mount_compare(const void *a, const void *b) {
318 const CustomMount *x = a, *y = b;
319 int r;
320
321 r = path_compare(x->destination, y->destination);
322 if (r != 0)
323 return r;
324
325 if (x->type < y->type)
326 return -1;
327 if (x->type > y->type)
328 return 1;
329
330 return 0;
331 }
332
333 static int custom_mounts_prepare(void) {
334 unsigned i;
335 int r;
336
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
339
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i = 0; i < arg_n_custom_mounts; i++) {
342 CustomMount *m = &arg_custom_mounts[i];
343
344 if (m->type != CUSTOM_MOUNT_OVERLAY)
345 continue;
346
347 if (m->work_dir)
348 continue;
349
350 if (m->read_only)
351 continue;
352
353 r = tempfn_random(m->source, &m->work_dir);
354 if (r < 0)
355 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
356 }
357
358 return 0;
359 }
360
/* Stores a cleaned-up copy of path in *b, freeing whatever *b pointed to
 * before. The path is canonicalized if it exists; if it does not exist
 * (ENOENT) it is merely made absolute relative to the current working
 * directory. The result is passed through path_kill_slashes(). Returns 0
 * on success and a negative errno-style value on failure, in which case
 * *b is left unchanged. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Not there (yet): fall back to a purely lexical absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
381
382 static int parse_argv(int argc, char *argv[]) {
383
384 enum {
385 ARG_VERSION = 0x100,
386 ARG_PRIVATE_NETWORK,
387 ARG_UUID,
388 ARG_READ_ONLY,
389 ARG_CAPABILITY,
390 ARG_DROP_CAPABILITY,
391 ARG_LINK_JOURNAL,
392 ARG_BIND,
393 ARG_BIND_RO,
394 ARG_TMPFS,
395 ARG_OVERLAY,
396 ARG_OVERLAY_RO,
397 ARG_SETENV,
398 ARG_SHARE_SYSTEM,
399 ARG_REGISTER,
400 ARG_KEEP_UNIT,
401 ARG_NETWORK_INTERFACE,
402 ARG_NETWORK_MACVLAN,
403 ARG_NETWORK_IPVLAN,
404 ARG_NETWORK_BRIDGE,
405 ARG_PERSONALITY,
406 ARG_VOLATILE,
407 ARG_TEMPLATE,
408 ARG_PROPERTY,
409 ARG_PRIVATE_USERS,
410 ARG_KILL_SIGNAL,
411 };
412
413 static const struct option options[] = {
414 { "help", no_argument, NULL, 'h' },
415 { "version", no_argument, NULL, ARG_VERSION },
416 { "directory", required_argument, NULL, 'D' },
417 { "template", required_argument, NULL, ARG_TEMPLATE },
418 { "ephemeral", no_argument, NULL, 'x' },
419 { "user", required_argument, NULL, 'u' },
420 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
421 { "boot", no_argument, NULL, 'b' },
422 { "uuid", required_argument, NULL, ARG_UUID },
423 { "read-only", no_argument, NULL, ARG_READ_ONLY },
424 { "capability", required_argument, NULL, ARG_CAPABILITY },
425 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
426 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
427 { "bind", required_argument, NULL, ARG_BIND },
428 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
429 { "tmpfs", required_argument, NULL, ARG_TMPFS },
430 { "overlay", required_argument, NULL, ARG_OVERLAY },
431 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
432 { "machine", required_argument, NULL, 'M' },
433 { "slice", required_argument, NULL, 'S' },
434 { "setenv", required_argument, NULL, ARG_SETENV },
435 { "selinux-context", required_argument, NULL, 'Z' },
436 { "selinux-apifs-context", required_argument, NULL, 'L' },
437 { "quiet", no_argument, NULL, 'q' },
438 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
439 { "register", required_argument, NULL, ARG_REGISTER },
440 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
441 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
442 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
443 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
444 { "network-veth", no_argument, NULL, 'n' },
445 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
446 { "personality", required_argument, NULL, ARG_PERSONALITY },
447 { "image", required_argument, NULL, 'i' },
448 { "volatile", optional_argument, NULL, ARG_VOLATILE },
449 { "port", required_argument, NULL, 'p' },
450 { "property", required_argument, NULL, ARG_PROPERTY },
451 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
452 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
453 {}
454 };
455
456 int c, r;
457 uint64_t plus = 0, minus = 0;
458
459 assert(argc >= 0);
460 assert(argv);
461
462 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
463
464 switch (c) {
465
466 case 'h':
467 help();
468 return 0;
469
470 case ARG_VERSION:
471 puts(PACKAGE_STRING);
472 puts(SYSTEMD_FEATURES);
473 return 0;
474
475 case 'D':
476 r = set_sanitized_path(&arg_directory, optarg);
477 if (r < 0)
478 return log_error_errno(r, "Invalid root directory: %m");
479
480 break;
481
482 case ARG_TEMPLATE:
483 r = set_sanitized_path(&arg_template, optarg);
484 if (r < 0)
485 return log_error_errno(r, "Invalid template directory: %m");
486
487 break;
488
489 case 'i':
490 r = set_sanitized_path(&arg_image, optarg);
491 if (r < 0)
492 return log_error_errno(r, "Invalid image path: %m");
493
494 break;
495
496 case 'x':
497 arg_ephemeral = true;
498 break;
499
500 case 'u':
501 free(arg_user);
502 arg_user = strdup(optarg);
503 if (!arg_user)
504 return log_oom();
505
506 break;
507
508 case ARG_NETWORK_BRIDGE:
509 arg_network_bridge = optarg;
510
511 /* fall through */
512
513 case 'n':
514 arg_network_veth = true;
515 arg_private_network = true;
516 break;
517
518 case ARG_NETWORK_INTERFACE:
519 if (strv_extend(&arg_network_interfaces, optarg) < 0)
520 return log_oom();
521
522 arg_private_network = true;
523 break;
524
525 case ARG_NETWORK_MACVLAN:
526 if (strv_extend(&arg_network_macvlan, optarg) < 0)
527 return log_oom();
528
529 arg_private_network = true;
530 break;
531
532 case ARG_NETWORK_IPVLAN:
533 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
534 return log_oom();
535
536 /* fall through */
537
538 case ARG_PRIVATE_NETWORK:
539 arg_private_network = true;
540 break;
541
542 case 'b':
543 arg_boot = true;
544 break;
545
546 case ARG_UUID:
547 r = sd_id128_from_string(optarg, &arg_uuid);
548 if (r < 0) {
549 log_error("Invalid UUID: %s", optarg);
550 return r;
551 }
552 break;
553
554 case 'S':
555 arg_slice = optarg;
556 break;
557
558 case 'M':
559 if (isempty(optarg)) {
560 free(arg_machine);
561 arg_machine = NULL;
562 } else {
563 if (!machine_name_is_valid(optarg)) {
564 log_error("Invalid machine name: %s", optarg);
565 return -EINVAL;
566 }
567
568 r = free_and_strdup(&arg_machine, optarg);
569 if (r < 0)
570 return log_oom();
571
572 break;
573 }
574
575 case 'Z':
576 arg_selinux_context = optarg;
577 break;
578
579 case 'L':
580 arg_selinux_apifs_context = optarg;
581 break;
582
583 case ARG_READ_ONLY:
584 arg_read_only = true;
585 break;
586
587 case ARG_CAPABILITY:
588 case ARG_DROP_CAPABILITY: {
589 const char *state, *word;
590 size_t length;
591
592 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
593 _cleanup_free_ char *t;
594
595 t = strndup(word, length);
596 if (!t)
597 return log_oom();
598
599 if (streq(t, "all")) {
600 if (c == ARG_CAPABILITY)
601 plus = (uint64_t) -1;
602 else
603 minus = (uint64_t) -1;
604 } else {
605 int cap;
606
607 cap = capability_from_name(t);
608 if (cap < 0) {
609 log_error("Failed to parse capability %s.", t);
610 return -EINVAL;
611 }
612
613 if (c == ARG_CAPABILITY)
614 plus |= 1ULL << (uint64_t) cap;
615 else
616 minus |= 1ULL << (uint64_t) cap;
617 }
618 }
619
620 break;
621 }
622
623 case 'j':
624 arg_link_journal = LINK_GUEST;
625 arg_link_journal_try = true;
626 break;
627
628 case ARG_LINK_JOURNAL:
629 if (streq(optarg, "auto")) {
630 arg_link_journal = LINK_AUTO;
631 arg_link_journal_try = false;
632 } else if (streq(optarg, "no")) {
633 arg_link_journal = LINK_NO;
634 arg_link_journal_try = false;
635 } else if (streq(optarg, "guest")) {
636 arg_link_journal = LINK_GUEST;
637 arg_link_journal_try = false;
638 } else if (streq(optarg, "host")) {
639 arg_link_journal = LINK_HOST;
640 arg_link_journal_try = false;
641 } else if (streq(optarg, "try-guest")) {
642 arg_link_journal = LINK_GUEST;
643 arg_link_journal_try = true;
644 } else if (streq(optarg, "try-host")) {
645 arg_link_journal = LINK_HOST;
646 arg_link_journal_try = true;
647 } else {
648 log_error("Failed to parse link journal mode %s", optarg);
649 return -EINVAL;
650 }
651
652 break;
653
654 case ARG_BIND:
655 case ARG_BIND_RO: {
656 _cleanup_free_ char *source = NULL, *destination = NULL;
657 CustomMount *m;
658 char *e;
659
660 e = strchr(optarg, ':');
661 if (e) {
662 source = strndup(optarg, e - optarg);
663 destination = strdup(e + 1);
664 } else {
665 source = strdup(optarg);
666 destination = strdup(optarg);
667 }
668
669 if (!source || !destination)
670 return log_oom();
671
672 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
673 log_error("Invalid bind mount specification: %s", optarg);
674 return -EINVAL;
675 }
676
677 m = custom_mount_add(CUSTOM_MOUNT_BIND);
678 if (!m)
679 return log_oom();
680
681 m->source = source;
682 m->destination = destination;
683 m->read_only = c == ARG_BIND_RO;
684
685 source = destination = NULL;
686
687 break;
688 }
689
690 case ARG_TMPFS: {
691 _cleanup_free_ char *path = NULL, *opts = NULL;
692 CustomMount *m;
693 char *e;
694
695 e = strchr(optarg, ':');
696 if (e) {
697 path = strndup(optarg, e - optarg);
698 opts = strdup(e + 1);
699 } else {
700 path = strdup(optarg);
701 opts = strdup("mode=0755");
702 }
703
704 if (!path || !opts)
705 return log_oom();
706
707 if (!path_is_absolute(path)) {
708 log_error("Invalid tmpfs specification: %s", optarg);
709 return -EINVAL;
710 }
711
712 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
713 if (!m)
714 return log_oom();
715
716 m->destination = path;
717 m->options = opts;
718
719 path = opts = NULL;
720
721 break;
722 }
723
724 case ARG_OVERLAY:
725 case ARG_OVERLAY_RO: {
726 _cleanup_free_ char *upper = NULL, *destination = NULL;
727 _cleanup_strv_free_ char **lower = NULL;
728 CustomMount *m;
729 unsigned n = 0;
730 char **i;
731
732 lower = strv_split(optarg, ":");
733 if (!lower)
734 return log_oom();
735
736 STRV_FOREACH(i, lower) {
737 if (!path_is_absolute(*i)) {
738 log_error("Overlay path %s is not absolute.", *i);
739 return -EINVAL;
740 }
741
742 n++;
743 }
744
745 if (n < 2) {
746 log_error("--overlay= needs at least two colon-separated directories specified.");
747 return -EINVAL;
748 }
749
750 if (n == 2) {
751 /* If two parameters are specified,
752 * the first one is the lower, the
753 * second one the upper directory. And
754 * we'll also define the the
755 * destination mount point the same as
756 * the upper. */
757 upper = lower[1];
758 lower[1] = NULL;
759
760 destination = strdup(upper);
761 if (!destination)
762 return log_oom();
763
764 } else {
765 upper = lower[n - 2];
766 destination = lower[n - 1];
767 lower[n - 2] = NULL;
768 }
769
770 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
771 if (!m)
772 return log_oom();
773
774 m->destination = destination;
775 m->source = upper;
776 m->lower = lower;
777 m->read_only = c == ARG_OVERLAY_RO;
778
779 upper = destination = NULL;
780 lower = NULL;
781
782 break;
783 }
784
785 case ARG_SETENV: {
786 char **n;
787
788 if (!env_assignment_is_valid(optarg)) {
789 log_error("Environment variable assignment '%s' is not valid.", optarg);
790 return -EINVAL;
791 }
792
793 n = strv_env_set(arg_setenv, optarg);
794 if (!n)
795 return log_oom();
796
797 strv_free(arg_setenv);
798 arg_setenv = n;
799 break;
800 }
801
802 case 'q':
803 arg_quiet = true;
804 break;
805
806 case ARG_SHARE_SYSTEM:
807 arg_share_system = true;
808 break;
809
810 case ARG_REGISTER:
811 r = parse_boolean(optarg);
812 if (r < 0) {
813 log_error("Failed to parse --register= argument: %s", optarg);
814 return r;
815 }
816
817 arg_register = r;
818 break;
819
820 case ARG_KEEP_UNIT:
821 arg_keep_unit = true;
822 break;
823
824 case ARG_PERSONALITY:
825
826 arg_personality = personality_from_string(optarg);
827 if (arg_personality == PERSONALITY_INVALID) {
828 log_error("Unknown or unsupported personality '%s'.", optarg);
829 return -EINVAL;
830 }
831
832 break;
833
834 case ARG_VOLATILE:
835
836 if (!optarg)
837 arg_volatile = VOLATILE_YES;
838 else {
839 r = parse_boolean(optarg);
840 if (r < 0) {
841 if (streq(optarg, "state"))
842 arg_volatile = VOLATILE_STATE;
843 else {
844 log_error("Failed to parse --volatile= argument: %s", optarg);
845 return r;
846 }
847 } else
848 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
849 }
850
851 break;
852
853 case 'p': {
854 const char *split, *e;
855 uint16_t container_port, host_port;
856 int protocol;
857 ExposePort *p;
858
859 if ((e = startswith(optarg, "tcp:")))
860 protocol = IPPROTO_TCP;
861 else if ((e = startswith(optarg, "udp:")))
862 protocol = IPPROTO_UDP;
863 else {
864 e = optarg;
865 protocol = IPPROTO_TCP;
866 }
867
868 split = strchr(e, ':');
869 if (split) {
870 char v[split - e + 1];
871
872 memcpy(v, e, split - e);
873 v[split - e] = 0;
874
875 r = safe_atou16(v, &host_port);
876 if (r < 0 || host_port <= 0) {
877 log_error("Failed to parse host port: %s", optarg);
878 return -EINVAL;
879 }
880
881 r = safe_atou16(split + 1, &container_port);
882 } else {
883 r = safe_atou16(e, &container_port);
884 host_port = container_port;
885 }
886
887 if (r < 0 || container_port <= 0) {
888 log_error("Failed to parse host port: %s", optarg);
889 return -EINVAL;
890 }
891
892 LIST_FOREACH(ports, p, arg_expose_ports) {
893 if (p->protocol == protocol && p->host_port == host_port) {
894 log_error("Duplicate port specification: %s", optarg);
895 return -EINVAL;
896 }
897 }
898
899 p = new(ExposePort, 1);
900 if (!p)
901 return log_oom();
902
903 p->protocol = protocol;
904 p->host_port = host_port;
905 p->container_port = container_port;
906
907 LIST_PREPEND(ports, arg_expose_ports, p);
908
909 break;
910 }
911
912 case ARG_PROPERTY:
913 if (strv_extend(&arg_property, optarg) < 0)
914 return log_oom();
915
916 break;
917
918 case ARG_PRIVATE_USERS:
919 if (optarg) {
920 _cleanup_free_ char *buffer = NULL;
921 const char *range, *shift;
922
923 range = strchr(optarg, ':');
924 if (range) {
925 buffer = strndup(optarg, range - optarg);
926 if (!buffer)
927 return log_oom();
928 shift = buffer;
929
930 range++;
931 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
932 log_error("Failed to parse UID range: %s", range);
933 return -EINVAL;
934 }
935 } else
936 shift = optarg;
937
938 if (parse_uid(shift, &arg_uid_shift) < 0) {
939 log_error("Failed to parse UID: %s", optarg);
940 return -EINVAL;
941 }
942 }
943
944 arg_userns = true;
945 break;
946
947 case ARG_KILL_SIGNAL:
948 arg_kill_signal = signal_from_string_try_harder(optarg);
949 if (arg_kill_signal < 0) {
950 log_error("Cannot parse signal: %s", optarg);
951 return -EINVAL;
952 }
953
954 break;
955
956 case '?':
957 return -EINVAL;
958
959 default:
960 assert_not_reached("Unhandled option");
961 }
962
963 if (arg_share_system)
964 arg_register = false;
965
966 if (arg_boot && arg_share_system) {
967 log_error("--boot and --share-system may not be combined.");
968 return -EINVAL;
969 }
970
971 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
972 log_error("--keep-unit may not be used when invoked from a user session.");
973 return -EINVAL;
974 }
975
976 if (arg_directory && arg_image) {
977 log_error("--directory= and --image= may not be combined.");
978 return -EINVAL;
979 }
980
981 if (arg_template && arg_image) {
982 log_error("--template= and --image= may not be combined.");
983 return -EINVAL;
984 }
985
986 if (arg_template && !(arg_directory || arg_machine)) {
987 log_error("--template= needs --directory= or --machine=.");
988 return -EINVAL;
989 }
990
991 if (arg_ephemeral && arg_template) {
992 log_error("--ephemeral and --template= may not be combined.");
993 return -EINVAL;
994 }
995
996 if (arg_ephemeral && arg_image) {
997 log_error("--ephemeral and --image= may not be combined.");
998 return -EINVAL;
999 }
1000
1001 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1002 log_error("--ephemeral and --link-journal= may not be combined.");
1003 return -EINVAL;
1004 }
1005
1006 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1007 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1008 return -EINVAL;
1009 }
1010
1011 if (arg_expose_ports && !arg_private_network) {
1012 log_error("Cannot use --port= without private networking.");
1013 return -EINVAL;
1014 }
1015
1016 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1017
1018 if (arg_boot && arg_kill_signal <= 0)
1019 arg_kill_signal = SIGRTMIN+3;
1020
1021 return 1;
1022 }
1023
1024 static int tmpfs_patch_options(const char *options, char **ret) {
1025 char *buf = NULL;
1026
1027 if (arg_userns && arg_uid_shift != 0) {
1028
1029 if (options)
1030 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1031 else
1032 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1033 if (!buf)
1034 return -ENOMEM;
1035
1036 options = buf;
1037 }
1038
1039 #ifdef HAVE_SELINUX
1040 if (arg_selinux_apifs_context) {
1041 char *t;
1042
1043 if (options)
1044 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1045 else
1046 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1047 if (!t) {
1048 free(buf);
1049 return -ENOMEM;
1050 }
1051
1052 free(buf);
1053 buf = t;
1054 }
1055 #endif
1056
1057 *ret = buf;
1058 return !!buf;
1059 }
1060
1061 static int mount_all(const char *dest, bool userns) {
1062
1063 typedef struct MountPoint {
1064 const char *what;
1065 const char *where;
1066 const char *type;
1067 const char *options;
1068 unsigned long flags;
1069 bool fatal;
1070 bool userns;
1071 } MountPoint;
1072
1073 static const MountPoint mount_table[] = {
1074 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1075 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1076 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true, true }, /* Then, make it r/o */
1077 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1078 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1079 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1080 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1081 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1082 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1083 #ifdef HAVE_SELINUX
1084 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1085 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false, false }, /* Then, make it r/o */
1086 #endif
1087 };
1088
1089 unsigned k;
1090 int r;
1091
1092 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1093 _cleanup_free_ char *where = NULL, *options = NULL;
1094 const char *o;
1095
1096 if (userns != mount_table[k].userns)
1097 continue;
1098
1099 where = prefix_root(dest, mount_table[k].where);
1100 if (!where)
1101 return log_oom();
1102
1103 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1104 if (r < 0 && r != -ENOENT)
1105 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1106
1107 /* Skip this entry if it is not a remount. */
1108 if (mount_table[k].what && r > 0)
1109 continue;
1110
1111 r = mkdir_p(where, 0755);
1112 if (r < 0) {
1113 if (mount_table[k].fatal)
1114 return log_error_errno(r, "Failed to create directory %s: %m", where);
1115
1116 log_warning_errno(r, "Failed to create directory %s: %m", where);
1117 continue;
1118 }
1119
1120 o = mount_table[k].options;
1121 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1122 r = tmpfs_patch_options(o, &options);
1123 if (r < 0)
1124 return log_oom();
1125 if (r > 0)
1126 o = options;
1127 }
1128
1129 if (mount(mount_table[k].what,
1130 where,
1131 mount_table[k].type,
1132 mount_table[k].flags,
1133 o) < 0) {
1134
1135 if (mount_table[k].fatal)
1136 return log_error_errno(errno, "mount(%s) failed: %m", where);
1137
1138 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1139 }
1140 }
1141
1142 return 0;
1143 }
1144
1145 static int mount_bind(const char *dest, CustomMount *m) {
1146 struct stat source_st, dest_st;
1147 const char *where;
1148 int r;
1149
1150 assert(m);
1151
1152 if (stat(m->source, &source_st) < 0)
1153 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1154
1155 where = prefix_roota(dest, m->destination);
1156
1157 if (stat(where, &dest_st) >= 0) {
1158 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1159 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1160 return -EINVAL;
1161 }
1162
1163 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1164 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1165 return -EINVAL;
1166 }
1167
1168 } else if (errno == ENOENT) {
1169 r = mkdir_parents_label(where, 0755);
1170 if (r < 0)
1171 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1172 } else {
1173 log_error_errno(errno, "Failed to stat %s: %m", where);
1174 return -errno;
1175 }
1176
1177 /* Create the mount point. Any non-directory file can be
1178 * mounted on any non-directory file (regular, fifo, socket,
1179 * char, block).
1180 */
1181 if (S_ISDIR(source_st.st_mode))
1182 r = mkdir_label(where, 0755);
1183 else
1184 r = touch(where);
1185 if (r < 0 && r != -EEXIST)
1186 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1187
1188 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1189 return log_error_errno(errno, "mount(%s) failed: %m", where);
1190
1191 if (m->read_only) {
1192 r = bind_remount_recursive(where, true);
1193 if (r < 0)
1194 return log_error_errno(r, "Read-only bind mount failed: %m");
1195 }
1196
1197 return 0;
1198 }
1199
1200 static int mount_tmpfs(const char *dest, CustomMount *m) {
1201 const char *where, *options;
1202 _cleanup_free_ char *buf = NULL;
1203 int r;
1204
1205 assert(dest);
1206 assert(m);
1207
1208 where = prefix_roota(dest, m->destination);
1209
1210 r = mkdir_p_label(where, 0755);
1211 if (r < 0 && r != -EEXIST)
1212 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1213
1214 r = tmpfs_patch_options(m->options, &buf);
1215 if (r < 0)
1216 return log_oom();
1217 options = r > 0 ? buf : m->options;
1218
1219 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1220 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1221
1222 return 0;
1223 }
1224
1225 static int mount_overlay(const char *dest, CustomMount *m) {
1226 _cleanup_free_ char *lower = NULL;
1227 const char *where, *options;
1228 int r;
1229
1230 assert(dest);
1231 assert(m);
1232
1233 where = prefix_roota(dest, m->destination);
1234
1235 r = mkdir_label(where, 0755);
1236 if (r < 0 && r != -EEXIST)
1237 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1238
1239 (void) mkdir_p_label(m->source, 0755);
1240
1241 strv_reverse(m->lower);
1242 lower = strv_join(m->lower, ":");
1243 strv_reverse(m->lower);
1244 if (!lower)
1245 return log_oom();
1246
1247 if (m->read_only)
1248 options = strjoina("lowerdir=", m->source, ":", lower);
1249 else {
1250 assert(m->work_dir);
1251 (void) mkdir_label(m->work_dir, 0700);
1252
1253 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1254 }
1255
1256 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1257 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1258
1259 return 0;
1260 }
1261
1262 static int mount_custom(const char *dest) {
1263 unsigned i;
1264 int r;
1265
1266 assert(dest);
1267
1268 for (i = 0; i < arg_n_custom_mounts; i++) {
1269 CustomMount *m = &arg_custom_mounts[i];
1270
1271 switch (m->type) {
1272
1273 case CUSTOM_MOUNT_BIND:
1274 r = mount_bind(dest, m);
1275 break;
1276
1277 case CUSTOM_MOUNT_TMPFS:
1278 r = mount_tmpfs(dest, m);
1279 break;
1280
1281 case CUSTOM_MOUNT_OVERLAY:
1282 r = mount_overlay(dest, m);
1283 break;
1284
1285 default:
1286 assert_not_reached("Unknown custom mount type");
1287 }
1288
1289 if (r < 0)
1290 return r;
1291 }
1292
1293 return 0;
1294 }
1295
/* Mounts a cgroup file system at <dest>/sys/fs/cgroup/<hierarchy>, passing
 * "controller" as the mount option string, unless that path already is a
 * mount point. With read_only set, the mount is additionally remounted
 * read-only as a bind mount.
 *
 * Returns 0 if the path was mounted already, 1 if it was mounted by this
 * call, and a negative value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        char *path;
        int r;

        path = strjoina(dest, "/sys/fs/cgroup/", hierarchy);

        r = path_is_mount_point(path, 0);
        if (r > 0)
                return 0;
        if (r < 0 && r != -ENOENT)
                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", path);

        /* Result deliberately ignored, the mount() below fails if the directory is missing */
        (void) mkdir_p(path, 0755);

        /* The superblock mount options of the mount point need to be
         * identical to the hosts', and hence writable... */
        if (mount("cgroup", path, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", path);

        /* ... hence let's only make the bind mount read-only, not the
         * superblock. */
        if (read_only &&
            mount(NULL, path, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", path);

        return 1;
}
1323
1324 static int mount_cgroup(const char *dest) {
1325 _cleanup_set_free_free_ Set *controllers = NULL;
1326 const char *cgroup_root;
1327 int r;
1328
1329 controllers = set_new(&string_hash_ops);
1330 if (!controllers)
1331 return log_oom();
1332
1333 r = cg_kernel_controllers(controllers);
1334 if (r < 0)
1335 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1336
1337 for (;;) {
1338 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1339
1340 controller = set_steal_first(controllers);
1341 if (!controller)
1342 break;
1343
1344 origin = prefix_root("/sys/fs/cgroup/", controller);
1345 if (!origin)
1346 return log_oom();
1347
1348 r = readlink_malloc(origin, &combined);
1349 if (r == -EINVAL) {
1350 /* Not a symbolic link, but directly a single cgroup hierarchy */
1351
1352 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1353 if (r < 0)
1354 return r;
1355
1356 } else if (r < 0)
1357 return log_error_errno(r, "Failed to read link %s: %m", origin);
1358 else {
1359 _cleanup_free_ char *target = NULL;
1360
1361 target = prefix_root(dest, origin);
1362 if (!target)
1363 return log_oom();
1364
1365 /* A symbolic link, a combination of controllers in one hierarchy */
1366
1367 if (!filename_is_valid(combined)) {
1368 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1369 continue;
1370 }
1371
1372 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1373 if (r < 0)
1374 return r;
1375
1376 r = symlink_idempotent(combined, target);
1377 if (r == -EINVAL) {
1378 log_error("Invalid existing symlink for combined hierarchy");
1379 return r;
1380 }
1381 if (r < 0)
1382 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1383 }
1384 }
1385
1386 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1387 if (r < 0)
1388 return r;
1389
1390 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1391 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1392 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1393
1394 return 0;
1395 }
1396
1397 static int mount_systemd_cgroup_writable(const char *dest) {
1398 _cleanup_free_ char *own_cgroup_path = NULL;
1399 const char *systemd_root, *systemd_own;
1400 int r;
1401
1402 assert(dest);
1403
1404 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1405 if (r < 0)
1406 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1407
1408 /* Make our own cgroup a (writable) bind mount */
1409 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1410 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1411 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1412
1413 /* And then remount the systemd cgroup root read-only */
1414 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1415 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1416 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1417
1418 return 0;
1419 }
1420
1421 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1422 assert(p);
1423
1424 if (!arg_userns)
1425 return 0;
1426
1427 if (uid == UID_INVALID && gid == GID_INVALID)
1428 return 0;
1429
1430 if (uid != UID_INVALID) {
1431 uid += arg_uid_shift;
1432
1433 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1434 return -EOVERFLOW;
1435 }
1436
1437 if (gid != GID_INVALID) {
1438 gid += (gid_t) arg_uid_shift;
1439
1440 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1441 return -EOVERFLOW;
1442 }
1443
1444 if (lchown(p, uid, gid) < 0)
1445 return -errno;
1446
1447 return 0;
1448 }
1449
/* Creates the directory "path" below "root" with the given mode and, if it
 * was newly created, hands it to userns_lchown() with uid/gid. A directory
 * that exists already is left untouched and counts as success. Returns 0
 * on success, a negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1462
1463 static int setup_timezone(const char *dest) {
1464 _cleanup_free_ char *p = NULL, *q = NULL;
1465 const char *where, *check, *what;
1466 char *z, *y;
1467 int r;
1468
1469 assert(dest);
1470
1471 /* Fix the timezone, if possible */
1472 r = readlink_malloc("/etc/localtime", &p);
1473 if (r < 0) {
1474 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1475 return 0;
1476 }
1477
1478 z = path_startswith(p, "../usr/share/zoneinfo/");
1479 if (!z)
1480 z = path_startswith(p, "/usr/share/zoneinfo/");
1481 if (!z) {
1482 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1483 return 0;
1484 }
1485
1486 where = prefix_roota(dest, "/etc/localtime");
1487 r = readlink_malloc(where, &q);
1488 if (r >= 0) {
1489 y = path_startswith(q, "../usr/share/zoneinfo/");
1490 if (!y)
1491 y = path_startswith(q, "/usr/share/zoneinfo/");
1492
1493 /* Already pointing to the right place? Then do nothing .. */
1494 if (y && streq(y, z))
1495 return 0;
1496 }
1497
1498 check = strjoina("/usr/share/zoneinfo/", z);
1499 check = prefix_root(dest, check);
1500 if (laccess(check, F_OK) < 0) {
1501 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1502 return 0;
1503 }
1504
1505 r = unlink(where);
1506 if (r < 0 && errno != ENOENT) {
1507 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1508 return 0;
1509 }
1510
1511 what = strjoina("../usr/share/zoneinfo/", z);
1512 if (symlink(what, where) < 0) {
1513 log_error_errno(errno, "Failed to correct timezone of container: %m");
1514 return 0;
1515 }
1516
1517 r = userns_lchown(where, 0, 0);
1518 if (r < 0)
1519 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1520
1521 return 0;
1522 }
1523
1524 static int setup_resolv_conf(const char *dest) {
1525 const char *where = NULL;
1526 int r;
1527
1528 assert(dest);
1529
1530 if (arg_private_network)
1531 return 0;
1532
1533 /* Fix resolv.conf, if possible */
1534 where = prefix_roota(dest, "/etc/resolv.conf");
1535
1536 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1537 if (r < 0) {
1538 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1539 return 0;
1540 }
1541
1542 r = userns_lchown(where, 0, 0);
1543 if (r < 0)
1544 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1545
1546 return 0;
1547 }
1548
1549 static int setup_volatile_state(const char *directory) {
1550 _cleanup_free_ char *buf = NULL;
1551 const char *p, *options;
1552 int r;
1553
1554 assert(directory);
1555
1556 if (arg_volatile != VOLATILE_STATE)
1557 return 0;
1558
1559 /* --volatile=state means we simply overmount /var
1560 with a tmpfs, and the rest read-only. */
1561
1562 r = bind_remount_recursive(directory, true);
1563 if (r < 0)
1564 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1565
1566 p = prefix_roota(directory, "/var");
1567 r = mkdir(p, 0755);
1568 if (r < 0 && errno != EEXIST)
1569 return log_error_errno(errno, "Failed to create %s: %m", directory);
1570
1571 options = "mode=755";
1572 r = tmpfs_patch_options(options, &buf);
1573 if (r < 0)
1574 return log_oom();
1575 if (r > 0)
1576 options = buf;
1577
1578 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1579 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1580
1581 return 0;
1582 }
1583
1584 static int setup_volatile(const char *directory) {
1585 bool tmpfs_mounted = false, bind_mounted = false;
1586 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1587 _cleanup_free_ char *buf = NULL;
1588 const char *f, *t, *options;
1589 int r;
1590
1591 assert(directory);
1592
1593 if (arg_volatile != VOLATILE_YES)
1594 return 0;
1595
1596 /* --volatile=yes means we mount a tmpfs to the root dir, and
1597 the original /usr to use inside it, and that read-only. */
1598
1599 if (!mkdtemp(template))
1600 return log_error_errno(errno, "Failed to create temporary directory: %m");
1601
1602 options = "mode=755";
1603 r = tmpfs_patch_options(options, &buf);
1604 if (r < 0)
1605 return log_oom();
1606 if (r > 0)
1607 options = buf;
1608
1609 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1610 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1611 goto fail;
1612 }
1613
1614 tmpfs_mounted = true;
1615
1616 f = prefix_roota(directory, "/usr");
1617 t = prefix_roota(template, "/usr");
1618
1619 r = mkdir(t, 0755);
1620 if (r < 0 && errno != EEXIST) {
1621 r = log_error_errno(errno, "Failed to create %s: %m", t);
1622 goto fail;
1623 }
1624
1625 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1626 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1627 goto fail;
1628 }
1629
1630 bind_mounted = true;
1631
1632 r = bind_remount_recursive(t, true);
1633 if (r < 0) {
1634 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1635 goto fail;
1636 }
1637
1638 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1639 r = log_error_errno(errno, "Failed to move root mount: %m");
1640 goto fail;
1641 }
1642
1643 (void) rmdir(template);
1644
1645 return 0;
1646
1647 fail:
1648 if (bind_mounted)
1649 (void) umount(t);
1650
1651 if (tmpfs_mounted)
1652 (void) umount(template);
1653 (void) rmdir(template);
1654 return r;
1655 }
1656
1657 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1658 assert(s);
1659
1660 snprintf(s, 37,
1661 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1662 SD_ID128_FORMAT_VAL(id));
1663
1664 return s;
1665 }
1666
1667 static int setup_boot_id(const char *dest) {
1668 const char *from, *to;
1669 sd_id128_t rnd = {};
1670 char as_uuid[37];
1671 int r;
1672
1673 if (arg_share_system)
1674 return 0;
1675
1676 /* Generate a new randomized boot ID, so that each boot-up of
1677 * the container gets a new one */
1678
1679 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1680 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1681
1682 r = sd_id128_randomize(&rnd);
1683 if (r < 0)
1684 return log_error_errno(r, "Failed to generate random boot id: %m");
1685
1686 id128_format_as_uuid(rnd, as_uuid);
1687
1688 r = write_string_file(from, as_uuid);
1689 if (r < 0)
1690 return log_error_errno(r, "Failed to write boot id: %m");
1691
1692 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1693 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1694 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1695 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1696
1697 unlink(from);
1698 return r;
1699 }
1700
1701 static int copy_devnodes(const char *dest) {
1702
1703 static const char devnodes[] =
1704 "null\0"
1705 "zero\0"
1706 "full\0"
1707 "random\0"
1708 "urandom\0"
1709 "tty\0"
1710 "net/tun\0";
1711
1712 const char *d;
1713 int r = 0;
1714 _cleanup_umask_ mode_t u;
1715
1716 assert(dest);
1717
1718 u = umask(0000);
1719
1720 /* Create /dev/net, so that we can create /dev/net/tun in it */
1721 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1722 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1723
1724 NULSTR_FOREACH(d, devnodes) {
1725 _cleanup_free_ char *from = NULL, *to = NULL;
1726 struct stat st;
1727
1728 from = strappend("/dev/", d);
1729 to = prefix_root(dest, from);
1730
1731 if (stat(from, &st) < 0) {
1732
1733 if (errno != ENOENT)
1734 return log_error_errno(errno, "Failed to stat %s: %m", from);
1735
1736 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1737
1738 log_error("%s is not a char or block device, cannot copy.", from);
1739 return -EIO;
1740
1741 } else {
1742 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1743 if (errno != EPERM)
1744 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1745
1746 /* Some systems abusively restrict mknod but
1747 * allow bind mounts. */
1748 r = touch(to);
1749 if (r < 0)
1750 return log_error_errno(r, "touch (%s) failed: %m", to);
1751 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1752 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1753 }
1754
1755 r = userns_lchown(to, 0, 0);
1756 if (r < 0)
1757 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1758 }
1759 }
1760
1761 return r;
1762 }
1763
1764 static int setup_pts(const char *dest) {
1765 _cleanup_free_ char *options = NULL;
1766 const char *p;
1767
1768 #ifdef HAVE_SELINUX
1769 if (arg_selinux_apifs_context)
1770 (void) asprintf(&options,
1771 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
1772 arg_uid_shift,
1773 arg_uid_shift + TTY_GID,
1774 arg_selinux_apifs_context);
1775 else
1776 #endif
1777 (void) asprintf(&options,
1778 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
1779 arg_uid_shift,
1780 arg_uid_shift + TTY_GID);
1781
1782 if (!options)
1783 return log_oom();
1784
1785 /* Mount /dev/pts itself */
1786 p = prefix_roota(dest, "/dev/pts");
1787 if (mkdir(p, 0755) < 0)
1788 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1789 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1790 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1791 if (userns_lchown(p, 0, 0) < 0)
1792 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1793
1794 /* Create /dev/ptmx symlink */
1795 p = prefix_roota(dest, "/dev/ptmx");
1796 if (symlink("pts/ptmx", p) < 0)
1797 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1798 if (userns_lchown(p, 0, 0) < 0)
1799 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1800
1801 /* And fix /dev/pts/ptmx ownership */
1802 p = prefix_roota(dest, "/dev/pts/ptmx");
1803 if (userns_lchown(p, 0, 0) < 0)
1804 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1805
1806 return 0;
1807 }
1808
1809 static int setup_dev_console(const char *dest, const char *console) {
1810 _cleanup_umask_ mode_t u;
1811 const char *to;
1812 int r;
1813
1814 assert(dest);
1815 assert(console);
1816
1817 u = umask(0000);
1818
1819 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1820 if (r < 0)
1821 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1822
1823 /* We need to bind mount the right tty to /dev/console since
1824 * ptys can only exist on pts file systems. To have something
1825 * to bind mount things on we create a empty regular file. */
1826
1827 to = prefix_roota(dest, "/dev/console");
1828 r = touch(to);
1829 if (r < 0)
1830 return log_error_errno(r, "touch() for /dev/console failed: %m");
1831
1832 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1833 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1834
1835 return 0;
1836 }
1837
1838 static int setup_kmsg(const char *dest, int kmsg_socket) {
1839 const char *from, *to;
1840 _cleanup_umask_ mode_t u;
1841 int fd, k;
1842 union {
1843 struct cmsghdr cmsghdr;
1844 uint8_t buf[CMSG_SPACE(sizeof(int))];
1845 } control = {};
1846 struct msghdr mh = {
1847 .msg_control = &control,
1848 .msg_controllen = sizeof(control),
1849 };
1850 struct cmsghdr *cmsg;
1851
1852 assert(kmsg_socket >= 0);
1853
1854 u = umask(0000);
1855
1856 /* We create the kmsg FIFO as /run/kmsg, but immediately
1857 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1858 * on the reading side behave very similar to /proc/kmsg,
1859 * their writing side behaves differently from /dev/kmsg in
1860 * that writing blocks when nothing is reading. In order to
1861 * avoid any problems with containers deadlocking due to this
1862 * we simply make /dev/kmsg unavailable to the container. */
1863 from = prefix_roota(dest, "/run/kmsg");
1864 to = prefix_roota(dest, "/proc/kmsg");
1865
1866 if (mkfifo(from, 0600) < 0)
1867 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1868 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1869 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1870
1871 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1872 if (fd < 0)
1873 return log_error_errno(errno, "Failed to open fifo: %m");
1874
1875 cmsg = CMSG_FIRSTHDR(&mh);
1876 cmsg->cmsg_level = SOL_SOCKET;
1877 cmsg->cmsg_type = SCM_RIGHTS;
1878 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1879 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1880
1881 mh.msg_controllen = cmsg->cmsg_len;
1882
1883 /* Store away the fd in the socket, so that it stays open as
1884 * long as we run the child */
1885 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1886 safe_close(fd);
1887
1888 if (k < 0)
1889 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1890
1891 /* And now make the FIFO unavailable as /run/kmsg... */
1892 (void) unlink(from);
1893
1894 return 0;
1895 }
1896
1897 static int send_rtnl(int send_fd) {
1898 union {
1899 struct cmsghdr cmsghdr;
1900 uint8_t buf[CMSG_SPACE(sizeof(int))];
1901 } control = {};
1902 struct msghdr mh = {
1903 .msg_control = &control,
1904 .msg_controllen = sizeof(control),
1905 };
1906 struct cmsghdr *cmsg;
1907 _cleanup_close_ int fd = -1;
1908 ssize_t k;
1909
1910 assert(send_fd >= 0);
1911
1912 if (!arg_expose_ports)
1913 return 0;
1914
1915 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1916 if (fd < 0)
1917 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1918
1919 cmsg = CMSG_FIRSTHDR(&mh);
1920 cmsg->cmsg_level = SOL_SOCKET;
1921 cmsg->cmsg_type = SCM_RIGHTS;
1922 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1923 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1924
1925 mh.msg_controllen = cmsg->cmsg_len;
1926
1927 /* Store away the fd in the socket, so that it stays open as
1928 * long as we run the child */
1929 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1930 if (k < 0)
1931 return log_error_errno(errno, "Failed to send netlink fd: %m");
1932
1933 return 0;
1934 }
1935
1936 static int flush_ports(union in_addr_union *exposed) {
1937 ExposePort *p;
1938 int r, af = AF_INET;
1939
1940 assert(exposed);
1941
1942 if (!arg_expose_ports)
1943 return 0;
1944
1945 if (in_addr_is_null(af, exposed))
1946 return 0;
1947
1948 log_debug("Lost IP address.");
1949
1950 LIST_FOREACH(ports, p, arg_expose_ports) {
1951 r = fw_add_local_dnat(false,
1952 af,
1953 p->protocol,
1954 NULL,
1955 NULL, 0,
1956 NULL, 0,
1957 p->host_port,
1958 exposed,
1959 p->container_port,
1960 NULL);
1961 if (r < 0)
1962 log_warning_errno(r, "Failed to modify firewall: %m");
1963 }
1964
1965 *exposed = IN_ADDR_NULL;
1966 return 0;
1967 }
1968
1969 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1970 _cleanup_free_ struct local_address *addresses = NULL;
1971 _cleanup_free_ char *pretty = NULL;
1972 union in_addr_union new_exposed;
1973 ExposePort *p;
1974 bool add;
1975 int af = AF_INET, r;
1976
1977 assert(exposed);
1978
1979 /* Invoked each time an address is added or removed inside the
1980 * container */
1981
1982 if (!arg_expose_ports)
1983 return 0;
1984
1985 r = local_addresses(rtnl, 0, af, &addresses);
1986 if (r < 0)
1987 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1988
1989 add = r > 0 &&
1990 addresses[0].family == af &&
1991 addresses[0].scope < RT_SCOPE_LINK;
1992
1993 if (!add)
1994 return flush_ports(exposed);
1995
1996 new_exposed = addresses[0].address;
1997 if (in_addr_equal(af, exposed, &new_exposed))
1998 return 0;
1999
2000 in_addr_to_string(af, &new_exposed, &pretty);
2001 log_debug("New container IP is %s.", strna(pretty));
2002
2003 LIST_FOREACH(ports, p, arg_expose_ports) {
2004
2005 r = fw_add_local_dnat(true,
2006 af,
2007 p->protocol,
2008 NULL,
2009 NULL, 0,
2010 NULL, 0,
2011 p->host_port,
2012 &new_exposed,
2013 p->container_port,
2014 in_addr_is_null(af, exposed) ? NULL : exposed);
2015 if (r < 0)
2016 log_warning_errno(r, "Failed to modify firewall: %m");
2017 }
2018
2019 *exposed = new_exposed;
2020 return 0;
2021 }
2022
2023 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
2024 union in_addr_union *exposed = userdata;
2025
2026 assert(rtnl);
2027 assert(m);
2028 assert(exposed);
2029
2030 expose_ports(rtnl, exposed);
2031 return 0;
2032 }
2033
2034 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
2035 union {
2036 struct cmsghdr cmsghdr;
2037 uint8_t buf[CMSG_SPACE(sizeof(int))];
2038 } control = {};
2039 struct msghdr mh = {
2040 .msg_control = &control,
2041 .msg_controllen = sizeof(control),
2042 };
2043 struct cmsghdr *cmsg;
2044 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2045 int fd, r;
2046 ssize_t k;
2047
2048 assert(event);
2049 assert(recv_fd >= 0);
2050 assert(ret);
2051
2052 if (!arg_expose_ports)
2053 return 0;
2054
2055 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2056 if (k < 0)
2057 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2058
2059 cmsg = CMSG_FIRSTHDR(&mh);
2060 assert(cmsg->cmsg_level == SOL_SOCKET);
2061 assert(cmsg->cmsg_type == SCM_RIGHTS);
2062 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2063 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2064
2065 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
2066 if (r < 0) {
2067 safe_close(fd);
2068 return log_error_errno(r, "Failed to create rtnl object: %m");
2069 }
2070
2071 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2072 if (r < 0)
2073 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2074
2075 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2076 if (r < 0)
2077 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2078
2079 r = sd_rtnl_attach_event(rtnl, event, 0);
2080 if (r < 0)
2081 return log_error_errno(r, "Failed to add to even loop: %m");
2082
2083 *ret = rtnl;
2084 rtnl = NULL;
2085
2086 return 0;
2087 }
2088
2089 static int setup_hostname(void) {
2090
2091 if (arg_share_system)
2092 return 0;
2093
2094 if (sethostname_idempotent(arg_machine) < 0)
2095 return -errno;
2096
2097 return 0;
2098 }
2099
2100 static int setup_journal(const char *directory) {
2101 sd_id128_t machine_id, this_id;
2102 _cleanup_free_ char *b = NULL, *d = NULL;
2103 const char *etc_machine_id, *p, *q;
2104 char *id;
2105 int r;
2106
2107 /* Don't link journals in ephemeral mode */
2108 if (arg_ephemeral)
2109 return 0;
2110
2111 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2112
2113 r = read_one_line_file(etc_machine_id, &b);
2114 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2115 return 0;
2116 else if (r < 0)
2117 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2118
2119 id = strstrip(b);
2120 if (isempty(id) && arg_link_journal == LINK_AUTO)
2121 return 0;
2122
2123 /* Verify validity */
2124 r = sd_id128_from_string(id, &machine_id);
2125 if (r < 0)
2126 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2127
2128 r = sd_id128_get_machine(&this_id);
2129 if (r < 0)
2130 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2131
2132 if (sd_id128_equal(machine_id, this_id)) {
2133 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2134 "Host and machine ids are equal (%s): refusing to link journals", id);
2135 if (arg_link_journal == LINK_AUTO)
2136 return 0;
2137 return -EEXIST;
2138 }
2139
2140 if (arg_link_journal == LINK_NO)
2141 return 0;
2142
2143 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2144 if (r < 0)
2145 return log_error_errno(r, "Failed to create /var: %m");
2146
2147 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2148 if (r < 0)
2149 return log_error_errno(r, "Failed to create /var/log: %m");
2150
2151 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2152 if (r < 0)
2153 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2154
2155 p = strjoina("/var/log/journal/", id);
2156 q = prefix_roota(directory, p);
2157
2158 if (path_is_mount_point(p, 0) > 0) {
2159 if (arg_link_journal != LINK_AUTO) {
2160 log_error("%s: already a mount point, refusing to use for journal", p);
2161 return -EEXIST;
2162 }
2163
2164 return 0;
2165 }
2166
2167 if (path_is_mount_point(q, 0) > 0) {
2168 if (arg_link_journal != LINK_AUTO) {
2169 log_error("%s: already a mount point, refusing to use for journal", q);
2170 return -EEXIST;
2171 }
2172
2173 return 0;
2174 }
2175
2176 r = readlink_and_make_absolute(p, &d);
2177 if (r >= 0) {
2178 if ((arg_link_journal == LINK_GUEST ||
2179 arg_link_journal == LINK_AUTO) &&
2180 path_equal(d, q)) {
2181
2182 r = userns_mkdir(directory, p, 0755, 0, 0);
2183 if (r < 0)
2184 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2185 return 0;
2186 }
2187
2188 if (unlink(p) < 0)
2189 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2190 } else if (r == -EINVAL) {
2191
2192 if (arg_link_journal == LINK_GUEST &&
2193 rmdir(p) < 0) {
2194
2195 if (errno == ENOTDIR) {
2196 log_error("%s already exists and is neither a symlink nor a directory", p);
2197 return r;
2198 } else {
2199 log_error_errno(errno, "Failed to remove %s: %m", p);
2200 return -errno;
2201 }
2202 }
2203 } else if (r != -ENOENT) {
2204 log_error_errno(errno, "readlink(%s) failed: %m", p);
2205 return r;
2206 }
2207
2208 if (arg_link_journal == LINK_GUEST) {
2209
2210 if (symlink(q, p) < 0) {
2211 if (arg_link_journal_try) {
2212 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2213 return 0;
2214 } else {
2215 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2216 return -errno;
2217 }
2218 }
2219
2220 r = userns_mkdir(directory, p, 0755, 0, 0);
2221 if (r < 0)
2222 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2223 return 0;
2224 }
2225
2226 if (arg_link_journal == LINK_HOST) {
2227 /* don't create parents here -- if the host doesn't have
2228 * permanent journal set up, don't force it here */
2229 r = mkdir(p, 0755);
2230 if (r < 0) {
2231 if (arg_link_journal_try) {
2232 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2233 return 0;
2234 } else {
2235 log_error_errno(errno, "Failed to create %s: %m", p);
2236 return r;
2237 }
2238 }
2239
2240 } else if (access(p, F_OK) < 0)
2241 return 0;
2242
2243 if (dir_is_empty(q) == 0)
2244 log_warning("%s is not empty, proceeding anyway.", q);
2245
2246 r = userns_mkdir(directory, p, 0755, 0, 0);
2247 if (r < 0) {
2248 log_error_errno(errno, "Failed to create %s: %m", q);
2249 return r;
2250 }
2251
2252 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2253 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2254
2255 return 0;
2256 }
2257
2258 static int drop_capabilities(void) {
2259 return capability_bounding_set_drop(~arg_retain, false);
2260 }
2261
2262 static int register_machine(pid_t pid, int local_ifindex) {
2263 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2264 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2265 int r;
2266
2267 if (!arg_register)
2268 return 0;
2269
2270 r = sd_bus_default_system(&bus);
2271 if (r < 0)
2272 return log_error_errno(r, "Failed to open system bus: %m");
2273
2274 if (arg_keep_unit) {
2275 r = sd_bus_call_method(
2276 bus,
2277 "org.freedesktop.machine1",
2278 "/org/freedesktop/machine1",
2279 "org.freedesktop.machine1.Manager",
2280 "RegisterMachineWithNetwork",
2281 &error,
2282 NULL,
2283 "sayssusai",
2284 arg_machine,
2285 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2286 "nspawn",
2287 "container",
2288 (uint32_t) pid,
2289 strempty(arg_directory),
2290 local_ifindex > 0 ? 1 : 0, local_ifindex);
2291 } else {
2292 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2293 char **i;
2294 unsigned j;
2295
2296 r = sd_bus_message_new_method_call(
2297 bus,
2298 &m,
2299 "org.freedesktop.machine1",
2300 "/org/freedesktop/machine1",
2301 "org.freedesktop.machine1.Manager",
2302 "CreateMachineWithNetwork");
2303 if (r < 0)
2304 return bus_log_create_error(r);
2305
2306 r = sd_bus_message_append(
2307 m,
2308 "sayssusai",
2309 arg_machine,
2310 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2311 "nspawn",
2312 "container",
2313 (uint32_t) pid,
2314 strempty(arg_directory),
2315 local_ifindex > 0 ? 1 : 0, local_ifindex);
2316 if (r < 0)
2317 return bus_log_create_error(r);
2318
2319 r = sd_bus_message_open_container(m, 'a', "(sv)");
2320 if (r < 0)
2321 return bus_log_create_error(r);
2322
2323 if (!isempty(arg_slice)) {
2324 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2325 if (r < 0)
2326 return bus_log_create_error(r);
2327 }
2328
2329 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2330 if (r < 0)
2331 return bus_log_create_error(r);
2332
2333 /* If you make changes here, also make sure to update
2334 * systemd-nspawn@.service, to keep the device
2335 * policies in sync regardless if we are run with or
2336 * without the --keep-unit switch. */
2337 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2338 /* Allow the container to
2339 * access and create the API
2340 * device nodes, so that
2341 * PrivateDevices= in the
2342 * container can work
2343 * fine */
2344 "/dev/null", "rwm",
2345 "/dev/zero", "rwm",
2346 "/dev/full", "rwm",
2347 "/dev/random", "rwm",
2348 "/dev/urandom", "rwm",
2349 "/dev/tty", "rwm",
2350 "/dev/net/tun", "rwm",
2351 /* Allow the container
2352 * access to ptys. However,
2353 * do not permit the
2354 * container to ever create
2355 * these device nodes. */
2356 "/dev/pts/ptmx", "rw",
2357 "char-pts", "rw");
2358 if (r < 0)
2359 return bus_log_create_error(r);
2360
2361 for (j = 0; j < arg_n_custom_mounts; j++) {
2362 CustomMount *cm = &arg_custom_mounts[j];
2363
2364 if (cm->type != CUSTOM_MOUNT_BIND)
2365 continue;
2366
2367 r = is_device_node(cm->source);
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2370
2371 if (r) {
2372 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2373 cm->source, cm->read_only ? "r" : "rw");
2374 if (r < 0)
2375 return log_error_errno(r, "Failed to append message arguments: %m");
2376 }
2377 }
2378
2379 if (arg_kill_signal != 0) {
2380 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2381 if (r < 0)
2382 return bus_log_create_error(r);
2383
2384 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2385 if (r < 0)
2386 return bus_log_create_error(r);
2387 }
2388
2389 STRV_FOREACH(i, arg_property) {
2390 r = sd_bus_message_open_container(m, 'r', "sv");
2391 if (r < 0)
2392 return bus_log_create_error(r);
2393
2394 r = bus_append_unit_property_assignment(m, *i);
2395 if (r < 0)
2396 return r;
2397
2398 r = sd_bus_message_close_container(m);
2399 if (r < 0)
2400 return bus_log_create_error(r);
2401 }
2402
2403 r = sd_bus_message_close_container(m);
2404 if (r < 0)
2405 return bus_log_create_error(r);
2406
2407 r = sd_bus_call(bus, m, 0, &error, NULL);
2408 }
2409
2410 if (r < 0) {
2411 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2412 return r;
2413 }
2414
2415 return 0;
2416 }
2417
2418 static int terminate_machine(pid_t pid) {
2419 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2420 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2421 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2422 const char *path;
2423 int r;
2424
2425 if (!arg_register)
2426 return 0;
2427
2428 /* If we are reusing the unit, then just exit, systemd will do
2429 * the right thing when we exit. */
2430 if (arg_keep_unit)
2431 return 0;
2432
2433 r = sd_bus_default_system(&bus);
2434 if (r < 0)
2435 return log_error_errno(r, "Failed to open system bus: %m");
2436
2437 r = sd_bus_call_method(
2438 bus,
2439 "org.freedesktop.machine1",
2440 "/org/freedesktop/machine1",
2441 "org.freedesktop.machine1.Manager",
2442 "GetMachineByPID",
2443 &error,
2444 &reply,
2445 "u",
2446 (uint32_t) pid);
2447 if (r < 0) {
2448 /* Note that the machine might already have been
2449 * cleaned up automatically, hence don't consider it a
2450 * failure if we cannot get the machine object. */
2451 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2452 return 0;
2453 }
2454
2455 r = sd_bus_message_read(reply, "o", &path);
2456 if (r < 0)
2457 return bus_log_parse_error(r);
2458
2459 r = sd_bus_call_method(
2460 bus,
2461 "org.freedesktop.machine1",
2462 path,
2463 "org.freedesktop.machine1.Machine",
2464 "Terminate",
2465 &error,
2466 NULL,
2467 NULL);
2468 if (r < 0) {
2469 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2470 return 0;
2471 }
2472
2473 return 0;
2474 }
2475
2476 static int reset_audit_loginuid(void) {
2477 _cleanup_free_ char *p = NULL;
2478 int r;
2479
2480 if (arg_share_system)
2481 return 0;
2482
2483 r = read_one_line_file("/proc/self/loginuid", &p);
2484 if (r == -ENOENT)
2485 return 0;
2486 if (r < 0)
2487 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2488
2489 /* Already reset? */
2490 if (streq(p, "4294967295"))
2491 return 0;
2492
2493 r = write_string_file("/proc/self/loginuid", "4294967295");
2494 if (r < 0) {
2495 log_error_errno(r,
2496 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2497 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2498 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2499 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2500 "using systemd-nspawn. Sleeping for 5s... (%m)");
2501
2502 sleep(5);
2503 }
2504
2505 return 0;
2506 }
2507
2508 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2509 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2510 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2511
2512 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2513 uint8_t result[8];
2514 size_t l, sz;
2515 uint8_t *v, *i;
2516 int r;
2517
2518 l = strlen(arg_machine);
2519 sz = sizeof(sd_id128_t) + l;
2520 if (idx > 0)
2521 sz += sizeof(idx);
2522
2523 v = alloca(sz);
2524
2525 /* fetch some persistent data unique to the host */
2526 r = sd_id128_get_machine((sd_id128_t*) v);
2527 if (r < 0)
2528 return r;
2529
2530 /* combine with some data unique (on this host) to this
2531 * container instance */
2532 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2533 if (idx > 0) {
2534 idx = htole64(idx);
2535 memcpy(i, &idx, sizeof(idx));
2536 }
2537
2538 /* Let's hash the host machine ID plus the container name. We
2539 * use a fixed, but originally randomly created hash key here. */
2540 siphash24(result, v, sz, hash_key.bytes);
2541
2542 assert_cc(ETH_ALEN <= sizeof(result));
2543 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2544
2545 /* see eth_random_addr in the kernel */
2546 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2547 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2548
2549 return 0;
2550 }
2551
2552 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2553 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2554 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2555 struct ether_addr mac_host, mac_container;
2556 int r, i;
2557
2558 if (!arg_private_network)
2559 return 0;
2560
2561 if (!arg_network_veth)
2562 return 0;
2563
2564 /* Use two different interface name prefixes depending whether
2565 * we are in bridge mode or not. */
2566 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2567 arg_network_bridge ? "vb" : "ve", arg_machine);
2568
2569 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2570 if (r < 0)
2571 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2572
2573 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2574 if (r < 0)
2575 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2576
2577 r = sd_rtnl_open(&rtnl, 0);
2578 if (r < 0)
2579 return log_error_errno(r, "Failed to connect to netlink: %m");
2580
2581 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2582 if (r < 0)
2583 return log_error_errno(r, "Failed to allocate netlink message: %m");
2584
2585 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2586 if (r < 0)
2587 return log_error_errno(r, "Failed to add netlink interface name: %m");
2588
2589 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2590 if (r < 0)
2591 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2592
2593 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2594 if (r < 0)
2595 return log_error_errno(r, "Failed to open netlink container: %m");
2596
2597 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2598 if (r < 0)
2599 return log_error_errno(r, "Failed to open netlink container: %m");
2600
2601 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2602 if (r < 0)
2603 return log_error_errno(r, "Failed to open netlink container: %m");
2604
2605 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2606 if (r < 0)
2607 return log_error_errno(r, "Failed to add netlink interface name: %m");
2608
2609 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2610 if (r < 0)
2611 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2612
2613 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2614 if (r < 0)
2615 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2616
2617 r = sd_rtnl_message_close_container(m);
2618 if (r < 0)
2619 return log_error_errno(r, "Failed to close netlink container: %m");
2620
2621 r = sd_rtnl_message_close_container(m);
2622 if (r < 0)
2623 return log_error_errno(r, "Failed to close netlink container: %m");
2624
2625 r = sd_rtnl_message_close_container(m);
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to close netlink container: %m");
2628
2629 r = sd_rtnl_call(rtnl, m, 0, NULL);
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2632
2633 i = (int) if_nametoindex(iface_name);
2634 if (i <= 0)
2635 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2636
2637 *ifi = i;
2638
2639 return 0;
2640 }
2641
2642 static int setup_bridge(const char veth_name[], int *ifi) {
2643 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2644 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2645 int r, bridge;
2646
2647 if (!arg_private_network)
2648 return 0;
2649
2650 if (!arg_network_veth)
2651 return 0;
2652
2653 if (!arg_network_bridge)
2654 return 0;
2655
2656 bridge = (int) if_nametoindex(arg_network_bridge);
2657 if (bridge <= 0)
2658 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2659
2660 *ifi = bridge;
2661
2662 r = sd_rtnl_open(&rtnl, 0);
2663 if (r < 0)
2664 return log_error_errno(r, "Failed to connect to netlink: %m");
2665
2666 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2667 if (r < 0)
2668 return log_error_errno(r, "Failed to allocate netlink message: %m");
2669
2670 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2671 if (r < 0)
2672 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2673
2674 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2675 if (r < 0)
2676 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2677
2678 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2679 if (r < 0)
2680 return log_error_errno(r, "Failed to add netlink master field: %m");
2681
2682 r = sd_rtnl_call(rtnl, m, 0, NULL);
2683 if (r < 0)
2684 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2685
2686 return 0;
2687 }
2688
2689 static int parse_interface(struct udev *udev, const char *name) {
2690 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2691 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2692 int ifi;
2693
2694 ifi = (int) if_nametoindex(name);
2695 if (ifi <= 0)
2696 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2697
2698 sprintf(ifi_str, "n%i", ifi);
2699 d = udev_device_new_from_device_id(udev, ifi_str);
2700 if (!d)
2701 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2702
2703 if (udev_device_get_is_initialized(d) <= 0) {
2704 log_error("Network interface %s is not initialized yet.", name);
2705 return -EBUSY;
2706 }
2707
2708 return ifi;
2709 }
2710
2711 static int move_network_interfaces(pid_t pid) {
2712 _cleanup_udev_unref_ struct udev *udev = NULL;
2713 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2714 char **i;
2715 int r;
2716
2717 if (!arg_private_network)
2718 return 0;
2719
2720 if (strv_isempty(arg_network_interfaces))
2721 return 0;
2722
2723 r = sd_rtnl_open(&rtnl, 0);
2724 if (r < 0)
2725 return log_error_errno(r, "Failed to connect to netlink: %m");
2726
2727 udev = udev_new();
2728 if (!udev) {
2729 log_error("Failed to connect to udev.");
2730 return -ENOMEM;
2731 }
2732
2733 STRV_FOREACH(i, arg_network_interfaces) {
2734 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2735 int ifi;
2736
2737 ifi = parse_interface(udev, *i);
2738 if (ifi < 0)
2739 return ifi;
2740
2741 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2742 if (r < 0)
2743 return log_error_errno(r, "Failed to allocate netlink message: %m");
2744
2745 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2746 if (r < 0)
2747 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2748
2749 r = sd_rtnl_call(rtnl, m, 0, NULL);
2750 if (r < 0)
2751 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2752 }
2753
2754 return 0;
2755 }
2756
2757 static int setup_macvlan(pid_t pid) {
2758 _cleanup_udev_unref_ struct udev *udev = NULL;
2759 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2760 unsigned idx = 0;
2761 char **i;
2762 int r;
2763
2764 if (!arg_private_network)
2765 return 0;
2766
2767 if (strv_isempty(arg_network_macvlan))
2768 return 0;
2769
2770 r = sd_rtnl_open(&rtnl, 0);
2771 if (r < 0)
2772 return log_error_errno(r, "Failed to connect to netlink: %m");
2773
2774 udev = udev_new();
2775 if (!udev) {
2776 log_error("Failed to connect to udev.");
2777 return -ENOMEM;
2778 }
2779
2780 STRV_FOREACH(i, arg_network_macvlan) {
2781 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2782 _cleanup_free_ char *n = NULL;
2783 struct ether_addr mac;
2784 int ifi;
2785
2786 ifi = parse_interface(udev, *i);
2787 if (ifi < 0)
2788 return ifi;
2789
2790 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2791 if (r < 0)
2792 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2793
2794 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2795 if (r < 0)
2796 return log_error_errno(r, "Failed to allocate netlink message: %m");
2797
2798 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2799 if (r < 0)
2800 return log_error_errno(r, "Failed to add netlink interface index: %m");
2801
2802 n = strappend("mv-", *i);
2803 if (!n)
2804 return log_oom();
2805
2806 strshorten(n, IFNAMSIZ-1);
2807
2808 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to add netlink interface name: %m");
2811
2812 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2813 if (r < 0)
2814 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2815
2816 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2817 if (r < 0)
2818 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2819
2820 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2821 if (r < 0)
2822 return log_error_errno(r, "Failed to open netlink container: %m");
2823
2824 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2825 if (r < 0)
2826 return log_error_errno(r, "Failed to open netlink container: %m");
2827
2828 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2829 if (r < 0)
2830 return log_error_errno(r, "Failed to append macvlan mode: %m");
2831
2832 r = sd_rtnl_message_close_container(m);
2833 if (r < 0)
2834 return log_error_errno(r, "Failed to close netlink container: %m");
2835
2836 r = sd_rtnl_message_close_container(m);
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to close netlink container: %m");
2839
2840 r = sd_rtnl_call(rtnl, m, 0, NULL);
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2843 }
2844
2845 return 0;
2846 }
2847
2848 static int setup_ipvlan(pid_t pid) {
2849 _cleanup_udev_unref_ struct udev *udev = NULL;
2850 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2851 char **i;
2852 int r;
2853
2854 if (!arg_private_network)
2855 return 0;
2856
2857 if (strv_isempty(arg_network_ipvlan))
2858 return 0;
2859
2860 r = sd_rtnl_open(&rtnl, 0);
2861 if (r < 0)
2862 return log_error_errno(r, "Failed to connect to netlink: %m");
2863
2864 udev = udev_new();
2865 if (!udev) {
2866 log_error("Failed to connect to udev.");
2867 return -ENOMEM;
2868 }
2869
2870 STRV_FOREACH(i, arg_network_ipvlan) {
2871 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2872 _cleanup_free_ char *n = NULL;
2873 int ifi;
2874
2875 ifi = parse_interface(udev, *i);
2876 if (ifi < 0)
2877 return ifi;
2878
2879 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2880 if (r < 0)
2881 return log_error_errno(r, "Failed to allocate netlink message: %m");
2882
2883 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2884 if (r < 0)
2885 return log_error_errno(r, "Failed to add netlink interface index: %m");
2886
2887 n = strappend("iv-", *i);
2888 if (!n)
2889 return log_oom();
2890
2891 strshorten(n, IFNAMSIZ-1);
2892
2893 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2894 if (r < 0)
2895 return log_error_errno(r, "Failed to add netlink interface name: %m");
2896
2897 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2898 if (r < 0)
2899 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2900
2901 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2902 if (r < 0)
2903 return log_error_errno(r, "Failed to open netlink container: %m");
2904
2905 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2906 if (r < 0)
2907 return log_error_errno(r, "Failed to open netlink container: %m");
2908
2909 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2910 if (r < 0)
2911 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2912
2913 r = sd_rtnl_message_close_container(m);
2914 if (r < 0)
2915 return log_error_errno(r, "Failed to close netlink container: %m");
2916
2917 r = sd_rtnl_message_close_container(m);
2918 if (r < 0)
2919 return log_error_errno(r, "Failed to close netlink container: %m");
2920
2921 r = sd_rtnl_call(rtnl, m, 0, NULL);
2922 if (r < 0)
2923 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2924 }
2925
2926 return 0;
2927 }
2928
/* Builds and loads a seccomp filter with default action "allow".
 *
 * Each syscall in the table below is made to fail with EPERM unless the
 * capability listed next to it is present in arg_retain. In addition,
 * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
 * EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op that returns 0. Otherwise the
 * result of the last libseccomp call is returned (negative on failure). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* A retained capability keeps its syscalls usable */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when run inside one. We do not care, and simply
         * turn off the creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will fail with
         * EAFNOSUPPORT, which audit userspace takes as the sign that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success or not */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3016
3017 static int setup_propagate(const char *root) {
3018 const char *p, *q;
3019
3020 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3021 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3022 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3023 (void) mkdir_p(p, 0600);
3024
3025 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3026 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3027
3028 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3029 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3030
3031 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3032 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3033
3034 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3035 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3036 return log_error_errno(errno, "Failed to install propagation bind mount.");
3037
3038 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3039 return log_error_errno(errno, "Failed to make propagation mount read-only");
3040
3041 return 0;
3042 }
3043
3044 static int setup_image(char **device_path, int *loop_nr) {
3045 struct loop_info64 info = {
3046 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3047 };
3048 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3049 _cleanup_free_ char* loopdev = NULL;
3050 struct stat st;
3051 int r, nr;
3052
3053 assert(device_path);
3054 assert(loop_nr);
3055 assert(arg_image);
3056
3057 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3058 if (fd < 0)
3059 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3060
3061 if (fstat(fd, &st) < 0)
3062 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3063
3064 if (S_ISBLK(st.st_mode)) {
3065 char *p;
3066
3067 p = strdup(arg_image);
3068 if (!p)
3069 return log_oom();
3070
3071 *device_path = p;
3072
3073 *loop_nr = -1;
3074
3075 r = fd;
3076 fd = -1;
3077
3078 return r;
3079 }
3080
3081 if (!S_ISREG(st.st_mode)) {
3082 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3083 return -EINVAL;
3084 }
3085
3086 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3087 if (control < 0)
3088 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3089
3090 nr = ioctl(control, LOOP_CTL_GET_FREE);
3091 if (nr < 0)
3092 return log_error_errno(errno, "Failed to allocate loop device: %m");
3093
3094 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3095 return log_oom();
3096
3097 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3098 if (loop < 0)
3099 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3100
3101 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3102 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3103
3104 if (arg_read_only)
3105 info.lo_flags |= LO_FLAGS_READ_ONLY;
3106
3107 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3108 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3109
3110 *device_path = loopdev;
3111 loopdev = NULL;
3112
3113 *loop_nr = nr;
3114
3115 r = loop;
3116 loop = -1;
3117
3118 return r;
3119 }
3120
3121 #define PARTITION_TABLE_BLURB \
3122 "Note that the disk image needs to either contain only a single MBR partition of\n" \
3123 "type 0x83 that is marked bootable, or a single GPT partition of type " \
3124 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
3125 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
3126 "to be bootable with systemd-nspawn."
3127
3128 static int dissect_image(
3129 int fd,
3130 char **root_device, bool *root_device_rw,
3131 char **home_device, bool *home_device_rw,
3132 char **srv_device, bool *srv_device_rw,
3133 bool *secondary) {
3134
3135 #ifdef HAVE_BLKID
3136 int home_nr = -1, srv_nr = -1;
3137 #ifdef GPT_ROOT_NATIVE
3138 int root_nr = -1;
3139 #endif
3140 #ifdef GPT_ROOT_SECONDARY
3141 int secondary_root_nr = -1;
3142 #endif
3143 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3144 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3145 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3146 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3147 _cleanup_udev_unref_ struct udev *udev = NULL;
3148 struct udev_list_entry *first, *item;
3149 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3150 bool is_gpt, is_mbr, multiple_generic = false;
3151 const char *pttype = NULL;
3152 blkid_partlist pl;
3153 struct stat st;
3154 unsigned i;
3155 int r;
3156
3157 assert(fd >= 0);
3158 assert(root_device);
3159 assert(home_device);
3160 assert(srv_device);
3161 assert(secondary);
3162 assert(arg_image);
3163
3164 b = blkid_new_probe();
3165 if (!b)
3166 return log_oom();
3167
3168 errno = 0;
3169 r = blkid_probe_set_device(b, fd, 0, 0);
3170 if (r != 0) {
3171 if (errno == 0)
3172 return log_oom();
3173
3174 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3175 return -errno;
3176 }
3177
3178 blkid_probe_enable_partitions(b, 1);
3179 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3180
3181 errno = 0;
3182 r = blkid_do_safeprobe(b);
3183 if (r == -2 || r == 1) {
3184 log_error("Failed to identify any partition table on\n"
3185 " %s\n"
3186 PARTITION_TABLE_BLURB, arg_image);
3187 return -EINVAL;
3188 } else if (r != 0) {
3189 if (errno == 0)
3190 errno = EIO;
3191 log_error_errno(errno, "Failed to probe: %m");
3192 return -errno;
3193 }
3194
3195 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3196
3197 is_gpt = streq_ptr(pttype, "gpt");
3198 is_mbr = streq_ptr(pttype, "dos");
3199
3200 if (!is_gpt && !is_mbr) {
3201 log_error("No GPT or MBR partition table discovered on\n"
3202 " %s\n"
3203 PARTITION_TABLE_BLURB, arg_image);
3204 return -EINVAL;
3205 }
3206
3207 errno = 0;
3208 pl = blkid_probe_get_partitions(b);
3209 if (!pl) {
3210 if (errno == 0)
3211 return log_oom();
3212
3213 log_error("Failed to list partitions of %s", arg_image);
3214 return -errno;
3215 }
3216
3217 udev = udev_new();
3218 if (!udev)
3219 return log_oom();
3220
3221 if (fstat(fd, &st) < 0)
3222 return log_error_errno(errno, "Failed to stat block device: %m");
3223
3224 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3225 if (!d)
3226 return log_oom();
3227
3228 for (i = 0;; i++) {
3229 int n, m;
3230
3231 if (i >= 10) {
3232 log_error("Kernel partitions never appeared.");
3233 return -ENXIO;
3234 }
3235
3236 e = udev_enumerate_new(udev);
3237 if (!e)
3238 return log_oom();
3239
3240 r = udev_enumerate_add_match_parent(e, d);
3241 if (r < 0)
3242 return log_oom();
3243
3244 r = udev_enumerate_scan_devices(e);
3245 if (r < 0)
3246 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3247
3248 /* Count the partitions enumerated by the kernel */
3249 n = 0;
3250 first = udev_enumerate_get_list_entry(e);
3251 udev_list_entry_foreach(item, first)
3252 n++;
3253
3254 /* Count the partitions enumerated by blkid */
3255 m = blkid_partlist_numof_partitions(pl);
3256 if (n == m + 1)
3257 break;
3258 if (n > m + 1) {
3259 log_error("blkid and kernel partition list do not match.");
3260 return -EIO;
3261 }
3262 if (n < m + 1) {
3263 unsigned j;
3264
3265 /* The kernel has probed fewer partitions than
3266 * blkid? Maybe the kernel prober is still
3267 * running or it got EBUSY because udev
3268 * already opened the device. Let's reprobe
3269 * the device, which is a synchronous call
3270 * that waits until probing is complete. */
3271
3272 for (j = 0; j < 20; j++) {
3273
3274 r = ioctl(fd, BLKRRPART, 0);
3275 if (r < 0)
3276 r = -errno;
3277 if (r >= 0 || r != -EBUSY)
3278 break;
3279
3280 /* If something else has the device
3281 * open, such as an udev rule, the
3282 * ioctl will return EBUSY. Since
3283 * there's no way to wait until it
3284 * isn't busy anymore, let's just wait
3285 * a bit, and try again.
3286 *
3287 * This is really something they
3288 * should fix in the kernel! */
3289
3290 usleep(50 * USEC_PER_MSEC);
3291 }
3292
3293 if (r < 0)
3294 return log_error_errno(r, "Failed to reread partition table: %m");
3295 }
3296
3297 e = udev_enumerate_unref(e);
3298 }
3299
3300 first = udev_enumerate_get_list_entry(e);
3301 udev_list_entry_foreach(item, first) {
3302 _cleanup_udev_device_unref_ struct udev_device *q;
3303 const char *node;
3304 unsigned long long flags;
3305 blkid_partition pp;
3306 dev_t qn;
3307 int nr;
3308
3309 errno = 0;
3310 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3311 if (!q) {
3312 if (!errno)
3313 errno = ENOMEM;
3314
3315 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3316 return -errno;
3317 }
3318
3319 qn = udev_device_get_devnum(q);
3320 if (major(qn) == 0)
3321 continue;
3322
3323 if (st.st_rdev == qn)
3324 continue;
3325
3326 node = udev_device_get_devnode(q);
3327 if (!node)
3328 continue;
3329
3330 pp = blkid_partlist_devno_to_partition(pl, qn);
3331 if (!pp)
3332 continue;
3333
3334 flags = blkid_partition_get_flags(pp);
3335
3336 nr = blkid_partition_get_partno(pp);
3337 if (nr < 0)
3338 continue;
3339
3340 if (is_gpt) {
3341 sd_id128_t type_id;
3342 const char *stype;
3343
3344 if (flags & GPT_FLAG_NO_AUTO)
3345 continue;
3346
3347 stype = blkid_partition_get_type_string(pp);
3348 if (!stype)
3349 continue;
3350
3351 if (sd_id128_from_string(stype, &type_id) < 0)
3352 continue;
3353
3354 if (sd_id128_equal(type_id, GPT_HOME)) {
3355
3356 if (home && nr >= home_nr)
3357 continue;
3358
3359 home_nr = nr;
3360 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3361
3362 r = free_and_strdup(&home, node);
3363 if (r < 0)
3364 return log_oom();
3365
3366 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3367
3368 if (srv && nr >= srv_nr)
3369 continue;
3370
3371 srv_nr = nr;
3372 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3373
3374 r = free_and_strdup(&srv, node);
3375 if (r < 0)
3376 return log_oom();
3377 }
3378 #ifdef GPT_ROOT_NATIVE
3379 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3380
3381 if (root && nr >= root_nr)
3382 continue;
3383
3384 root_nr = nr;
3385 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3386
3387 r = free_and_strdup(&root, node);
3388 if (r < 0)
3389 return log_oom();
3390 }
3391 #endif
3392 #ifdef GPT_ROOT_SECONDARY
3393 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3394
3395 if (secondary_root && nr >= secondary_root_nr)
3396 continue;
3397
3398 secondary_root_nr = nr;
3399 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3400
3401 r = free_and_strdup(&secondary_root, node);
3402 if (r < 0)
3403 return log_oom();
3404 }
3405 #endif
3406 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3407
3408 if (generic)
3409 multiple_generic = true;
3410 else {
3411 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3412
3413 r = free_and_strdup(&generic, node);
3414 if (r < 0)
3415 return log_oom();
3416 }
3417 }
3418
3419 } else if (is_mbr) {
3420 int type;
3421
3422 if (flags != 0x80) /* Bootable flag */
3423 continue;
3424
3425 type = blkid_partition_get_type(pp);
3426 if (type != 0x83) /* Linux partition */
3427 continue;
3428
3429 if (generic)
3430 multiple_generic = true;
3431 else {
3432 generic_rw = true;
3433
3434 r = free_and_strdup(&root, node);
3435 if (r < 0)
3436 return log_oom();
3437 }
3438 }
3439 }
3440
3441 if (root) {
3442 *root_device = root;
3443 root = NULL;
3444
3445 *root_device_rw = root_rw;
3446 *secondary = false;
3447 } else if (secondary_root) {
3448 *root_device = secondary_root;
3449 secondary_root = NULL;
3450
3451 *root_device_rw = secondary_root_rw;
3452 *secondary = true;
3453 } else if (generic) {
3454
3455 /* There were no partitions with precise meanings
3456 * around, but we found generic partitions. In this
3457 * case, if there's only one, we can go ahead and boot
3458 * it, otherwise we bail out, because we really cannot
3459 * make any sense of it. */
3460
3461 if (multiple_generic) {
3462 log_error("Identified multiple bootable Linux partitions on\n"
3463 " %s\n"
3464 PARTITION_TABLE_BLURB, arg_image);
3465 return -EINVAL;
3466 }
3467
3468 *root_device = generic;
3469 generic = NULL;
3470
3471 *root_device_rw = generic_rw;
3472 *secondary = false;
3473 } else {
3474 log_error("Failed to identify root partition in disk image\n"
3475 " %s\n"
3476 PARTITION_TABLE_BLURB, arg_image);
3477 return -EINVAL;
3478 }
3479
3480 if (home) {
3481 *home_device = home;
3482 home = NULL;
3483
3484 *home_device_rw = home_rw;
3485 }
3486
3487 if (srv) {
3488 *srv_device = srv;
3489 srv = NULL;
3490
3491 *srv_device_rw = srv_rw;
3492 }
3493
3494 return 0;
3495 #else
3496 log_error("--image= is not supported, compiled without blkid support.");
3497 return -EOPNOTSUPP;
3498 #endif
3499 }
3500
/* Mounts the block device "what" below "where". If "directory" is non-NULL it
 * is appended to "where" to form the mount point. The file system type is
 * probed with libblkid; LUKS volumes are refused. The mount is read-only if
 * "rw" is false or if the global arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * built without blkid support this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *type, *target;
        unsigned long mount_flags;
        int rc;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-device setting */
        mount_flags = MS_NODEV;
        if (arg_read_only || !rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                /* A failure that left errno untouched is reported as OOM */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        rc = blkid_do_safeprobe(probe);
        switch (rc) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &type, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(type, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, target, type, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3564
3565 static int mount_devices(
3566 const char *where,
3567 const char *root_device, bool root_device_rw,
3568 const char *home_device, bool home_device_rw,
3569 const char *srv_device, bool srv_device_rw) {
3570 int r;
3571
3572 assert(where);
3573
3574 if (root_device) {
3575 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3576 if (r < 0)
3577 return log_error_errno(r, "Failed to mount root directory: %m");
3578 }
3579
3580 if (home_device) {
3581 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3582 if (r < 0)
3583 return log_error_errno(r, "Failed to mount home directory: %m");
3584 }
3585
3586 if (srv_device) {
3587 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3588 if (r < 0)
3589 return log_error_errno(r, "Failed to mount server data directory: %m");
3590 }
3591
3592 return 0;
3593 }
3594
3595 static void loop_remove(int nr, int *image_fd) {
3596 _cleanup_close_ int control = -1;
3597 int r;
3598
3599 if (nr < 0)
3600 return;
3601
3602 if (image_fd && *image_fd >= 0) {
3603 r = ioctl(*image_fd, LOOP_CLR_FD);
3604 if (r < 0)
3605 log_debug_errno(errno, "Failed to close loop image: %m");
3606 *image_fd = safe_close(*image_fd);
3607 }
3608
3609 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3610 if (control < 0) {
3611 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3612 return;
3613 }
3614
3615 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3616 if (r < 0)
3617 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3618 }
3619
3620 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3621 int pipe_fds[2];
3622 pid_t pid;
3623
3624 assert(database);
3625 assert(key);
3626 assert(rpid);
3627
3628 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3629 return log_error_errno(errno, "Failed to allocate pipe: %m");
3630
3631 pid = fork();
3632 if (pid < 0)
3633 return log_error_errno(errno, "Failed to fork getent child: %m");
3634 else if (pid == 0) {
3635 int nullfd;
3636 char *empty_env = NULL;
3637
3638 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3639 _exit(EXIT_FAILURE);
3640
3641 if (pipe_fds[0] > 2)
3642 safe_close(pipe_fds[0]);
3643 if (pipe_fds[1] > 2)
3644 safe_close(pipe_fds[1]);
3645
3646 nullfd = open("/dev/null", O_RDWR);
3647 if (nullfd < 0)
3648 _exit(EXIT_FAILURE);
3649
3650 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3651 _exit(EXIT_FAILURE);
3652
3653 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3654 _exit(EXIT_FAILURE);
3655
3656 if (nullfd > 2)
3657 safe_close(nullfd);
3658
3659 (void) reset_all_signal_handlers();
3660 (void) reset_signal_mask();
3661 close_all_fds(NULL, 0);
3662
3663 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3664 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3665 _exit(EXIT_FAILURE);
3666 }
3667
3668 pipe_fds[1] = safe_close(pipe_fds[1]);
3669
3670 *rpid = pid;
3671
3672 return pipe_fds[0];
3673 }
3674
3675 static int change_uid_gid(char **_home) {
3676 char line[LINE_MAX], *x, *u, *g, *h;
3677 const char *word, *state;
3678 _cleanup_free_ uid_t *uids = NULL;
3679 _cleanup_free_ char *home = NULL;
3680 _cleanup_fclose_ FILE *f = NULL;
3681 _cleanup_close_ int fd = -1;
3682 unsigned n_uids = 0;
3683 size_t sz = 0, l;
3684 uid_t uid;
3685 gid_t gid;
3686 pid_t pid;
3687 int r;
3688
3689 assert(_home);
3690
3691 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3692 /* Reset everything fully to 0, just in case */
3693
3694 r = reset_uid_gid();
3695 if (r < 0)
3696 return log_error_errno(r, "Failed to become root: %m");
3697
3698 *_home = NULL;
3699 return 0;
3700 }
3701
3702 /* First, get user credentials */
3703 fd = spawn_getent("passwd", arg_user, &pid);
3704 if (fd < 0)
3705 return fd;
3706
3707 f = fdopen(fd, "r");
3708 if (!f)
3709 return log_oom();
3710 fd = -1;
3711
3712 if (!fgets(line, sizeof(line), f)) {
3713
3714 if (!ferror(f)) {
3715 log_error("Failed to resolve user %s.", arg_user);
3716 return -ESRCH;
3717 }
3718
3719 log_error_errno(errno, "Failed to read from getent: %m");
3720 return -errno;
3721 }
3722
3723 truncate_nl(line);
3724
3725 wait_for_terminate_and_warn("getent passwd", pid, true);
3726
3727 x = strchr(line, ':');
3728 if (!x) {
3729 log_error("/etc/passwd entry has invalid user field.");
3730 return -EIO;
3731 }
3732
3733 u = strchr(x+1, ':');
3734 if (!u) {
3735 log_error("/etc/passwd entry has invalid password field.");
3736 return -EIO;
3737 }
3738
3739 u++;
3740 g = strchr(u, ':');
3741 if (!g) {
3742 log_error("/etc/passwd entry has invalid UID field.");
3743 return -EIO;
3744 }
3745
3746 *g = 0;
3747 g++;
3748 x = strchr(g, ':');
3749 if (!x) {
3750 log_error("/etc/passwd entry has invalid GID field.");
3751 return -EIO;
3752 }
3753
3754 *x = 0;
3755 h = strchr(x+1, ':');
3756 if (!h) {
3757 log_error("/etc/passwd entry has invalid GECOS field.");
3758 return -EIO;
3759 }
3760
3761 h++;
3762 x = strchr(h, ':');
3763 if (!x) {
3764 log_error("/etc/passwd entry has invalid home directory field.");
3765 return -EIO;
3766 }
3767
3768 *x = 0;
3769
3770 r = parse_uid(u, &uid);
3771 if (r < 0) {
3772 log_error("Failed to parse UID of user.");
3773 return -EIO;
3774 }
3775
3776 r = parse_gid(g, &gid);
3777 if (r < 0) {
3778 log_error("Failed to parse GID of user.");
3779 return -EIO;
3780 }
3781
3782 home = strdup(h);
3783 if (!home)
3784 return log_oom();
3785
3786 /* Second, get group memberships */
3787 fd = spawn_getent("initgroups", arg_user, &pid);
3788 if (fd < 0)
3789 return fd;
3790
3791 fclose(f);
3792 f = fdopen(fd, "r");
3793 if (!f)
3794 return log_oom();
3795 fd = -1;
3796
3797 if (!fgets(line, sizeof(line), f)) {
3798 if (!ferror(f)) {
3799 log_error("Failed to resolve user %s.", arg_user);
3800 return -ESRCH;
3801 }
3802
3803 log_error_errno(errno, "Failed to read from getent: %m");
3804 return -errno;
3805 }
3806
3807 truncate_nl(line);
3808
3809 wait_for_terminate_and_warn("getent initgroups", pid, true);
3810
3811 /* Skip over the username and subsequent separator whitespace */
3812 x = line;
3813 x += strcspn(x, WHITESPACE);
3814 x += strspn(x, WHITESPACE);
3815
3816 FOREACH_WORD(word, l, x, state) {
3817 char c[l+1];
3818
3819 memcpy(c, word, l);
3820 c[l] = 0;
3821
3822 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3823 return log_oom();
3824
3825 r = parse_uid(c, &uids[n_uids++]);
3826 if (r < 0) {
3827 log_error("Failed to parse group data from getent.");
3828 return -EIO;
3829 }
3830 }
3831
3832 r = mkdir_parents(home, 0775);
3833 if (r < 0)
3834 return log_error_errno(r, "Failed to make home root directory: %m");
3835
3836 r = mkdir_safe(home, 0755, uid, gid);
3837 if (r < 0 && r != -EEXIST)
3838 return log_error_errno(r, "Failed to make home directory: %m");
3839
3840 (void) fchown(STDIN_FILENO, uid, gid);
3841 (void) fchown(STDOUT_FILENO, uid, gid);
3842 (void) fchown(STDERR_FILENO, uid, gid);
3843
3844 if (setgroups(n_uids, uids) < 0)
3845 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3846
3847 if (setresgid(gid, gid, gid) < 0)
3848 return log_error_errno(errno, "setregid() failed: %m");
3849
3850 if (setresuid(uid, uid, uid) < 0)
3851 return log_error_errno(errno, "setreuid() failed: %m");
3852
3853 if (_home) {
3854 *_home = home;
3855 home = NULL;
3856 }
3857
3858 return 0;
3859 }
3860
3861 /*
3862 * Return values:
3863 * < 0 : wait_for_terminate() failed to get the state of the
3864 * container, the container was terminated by a signal, or
3865 * failed for an unknown reason. No change is made to the
3866 * container argument.
3867 * > 0 : The program executed in the container terminated with an
3868 * error. The exit code of the program executed in the
3869 * container is returned. The container argument has been set
3870 * to CONTAINER_TERMINATED.
3871 * 0 : The container is being rebooted, has been shut down or exited
3872 * successfully. The container argument has been set to either
3873 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3874 *
3875 * That is, success is indicated by a return value of zero, and an
3876 * error is indicated by a non-zero value.
3877 */
3878 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3879 siginfo_t status;
3880 int r;
3881
3882 r = wait_for_terminate(pid, &status);
3883 if (r < 0)
3884 return log_warning_errno(r, "Failed to wait for container: %m");
3885
3886 switch (status.si_code) {
3887
3888 case CLD_EXITED:
3889 if (status.si_status == 0) {
3890 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3891
3892 } else
3893 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3894
3895 *container = CONTAINER_TERMINATED;
3896 return status.si_status;
3897
3898 case CLD_KILLED:
3899 if (status.si_status == SIGINT) {
3900
3901 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3902 *container = CONTAINER_TERMINATED;
3903 return 0;
3904
3905 } else if (status.si_status == SIGHUP) {
3906
3907 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3908 *container = CONTAINER_REBOOTED;
3909 return 0;
3910 }
3911
3912 /* CLD_KILLED fallthrough */
3913
3914 case CLD_DUMPED:
3915 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3916 return -EIO;
3917
3918 default:
3919 log_error("Container %s failed due to unknown reason.", arg_machine);
3920 return -EIO;
3921 }
3922
3923 return r;
3924 }
3925
/* Signal handler with an empty body. */
static void nop_handler(int sig) {
        (void) sig;
}
3927
3928 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3929 pid_t pid;
3930
3931 pid = PTR_TO_UINT32(userdata);
3932 if (pid > 0) {
3933 if (kill(pid, arg_kill_signal) >= 0) {
3934 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3935 sd_event_source_set_userdata(s, NULL);
3936 return 0;
3937 }
3938 }
3939
3940 sd_event_exit(sd_event_source_get_event(s), 0);
3941 return 0;
3942 }
3943
3944 static int determine_names(void) {
3945 int r;
3946
3947 if (!arg_image && !arg_directory) {
3948 if (arg_machine) {
3949 _cleanup_(image_unrefp) Image *i = NULL;
3950
3951 r = image_find(arg_machine, &i);
3952 if (r < 0)
3953 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3954 else if (r == 0) {
3955 log_error("No image for machine '%s': %m", arg_machine);
3956 return -ENOENT;
3957 }
3958
3959 if (i->type == IMAGE_RAW)
3960 r = set_sanitized_path(&arg_image, i->path);
3961 else
3962 r = set_sanitized_path(&arg_directory, i->path);
3963 if (r < 0)
3964 return log_error_errno(r, "Invalid image directory: %m");
3965
3966 if (!arg_ephemeral)
3967 arg_read_only = arg_read_only || i->read_only;
3968 } else
3969 arg_directory = get_current_dir_name();
3970
3971 if (!arg_directory && !arg_machine) {
3972 log_error("Failed to determine path, please use -D or -i.");
3973 return -EINVAL;
3974 }
3975 }
3976
3977 if (!arg_machine) {
3978 if (arg_directory && path_equal(arg_directory, "/"))
3979 arg_machine = gethostname_malloc();
3980 else
3981 arg_machine = strdup(basename(arg_image ?: arg_directory));
3982
3983 if (!arg_machine)
3984 return log_oom();
3985
3986 hostname_cleanup(arg_machine, false);
3987 if (!machine_name_is_valid(arg_machine)) {
3988 log_error("Failed to determine machine name automatically, please use -M.");
3989 return -EINVAL;
3990 }
3991
3992 if (arg_ephemeral) {
3993 char *b;
3994
3995 /* Add a random suffix when this is an
3996 * ephemeral machine, so that we can run many
3997 * instances at once without manually having
3998 * to specify -M each time. */
3999
4000 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4001 return log_oom();
4002
4003 free(arg_machine);
4004 arg_machine = b;
4005 }
4006 }
4007
4008 return 0;
4009 }
4010
4011 static int determine_uid_shift(const char *directory) {
4012 int r;
4013
4014 if (!arg_userns) {
4015 arg_uid_shift = 0;
4016 return 0;
4017 }
4018
4019 if (arg_uid_shift == UID_INVALID) {
4020 struct stat st;
4021
4022 r = stat(directory, &st);
4023 if (r < 0)
4024 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4025
4026 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4027
4028 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4029 log_error("UID and GID base of %s don't match.", directory);
4030 return -EINVAL;
4031 }
4032
4033 arg_uid_range = UINT32_C(0x10000);
4034 }
4035
4036 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4037 log_error("UID base too high for UID range.");
4038 return -EINVAL;
4039 }
4040
4041 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4042 return 0;
4043 }
4044
4045 static int inner_child(
4046 Barrier *barrier,
4047 const char *directory,
4048 bool secondary,
4049 int kmsg_socket,
4050 int rtnl_socket,
4051 FDSet *fds,
4052 int argc,
4053 char *argv[]) {
4054
4055 _cleanup_free_ char *home = NULL;
4056 unsigned n_env = 2;
4057 const char *envp[] = {
4058 "PATH=" DEFAULT_PATH_SPLIT_USR,
4059 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4060 NULL, /* TERM */
4061 NULL, /* HOME */
4062 NULL, /* USER */
4063 NULL, /* LOGNAME */
4064 NULL, /* container_uuid */
4065 NULL, /* LISTEN_FDS */
4066 NULL, /* LISTEN_PID */
4067 NULL
4068 };
4069
4070 _cleanup_strv_free_ char **env_use = NULL;
4071 int r;
4072
4073 assert(barrier);
4074 assert(directory);
4075 assert(kmsg_socket >= 0);
4076
4077 if (arg_userns) {
4078 /* Tell the parent, that it now can write the UID map. */
4079 (void) barrier_place(barrier); /* #1 */
4080
4081 /* Wait until the parent wrote the UID map */
4082 if (!barrier_place_and_sync(barrier)) { /* #2 */
4083 log_error("Parent died too early");
4084 return -ESRCH;
4085 }
4086 }
4087
4088 r = mount_all(NULL, true);
4089 if (r < 0)
4090 return r;
4091
4092 /* Wait until we are cgroup-ified, so that we
4093 * can mount the right cgroup path writable */
4094 if (!barrier_place_and_sync(barrier)) { /* #3 */
4095 log_error("Parent died too early");
4096 return -ESRCH;
4097 }
4098
4099 r = mount_systemd_cgroup_writable("");
4100 if (r < 0)
4101 return r;
4102
4103 r = reset_uid_gid();
4104 if (r < 0)
4105 return log_error_errno(r, "Couldn't become new root: %m");
4106
4107 r = setup_boot_id(NULL);
4108 if (r < 0)
4109 return r;
4110
4111 r = setup_kmsg(NULL, kmsg_socket);
4112 if (r < 0)
4113 return r;
4114 kmsg_socket = safe_close(kmsg_socket);
4115
4116 umask(0022);
4117
4118 if (setsid() < 0)
4119 return log_error_errno(errno, "setsid() failed: %m");
4120
4121 if (arg_private_network)
4122 loopback_setup();
4123
4124 r = send_rtnl(rtnl_socket);
4125 if (r < 0)
4126 return r;
4127 rtnl_socket = safe_close(rtnl_socket);
4128
4129 if (drop_capabilities() < 0)
4130 return log_error_errno(errno, "drop_capabilities() failed: %m");
4131
4132 setup_hostname();
4133
4134 if (arg_personality != PERSONALITY_INVALID) {
4135 if (personality(arg_personality) < 0)
4136 return log_error_errno(errno, "personality() failed: %m");
4137 } else if (secondary) {
4138 if (personality(PER_LINUX32) < 0)
4139 return log_error_errno(errno, "personality() failed: %m");
4140 }
4141
4142 #ifdef HAVE_SELINUX
4143 if (arg_selinux_context)
4144 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4145 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4146 #endif
4147
4148 r = change_uid_gid(&home);
4149 if (r < 0)
4150 return r;
4151
4152 envp[n_env] = strv_find_prefix(environ, "TERM=");
4153 if (envp[n_env])
4154 n_env ++;
4155
4156 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4157 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4158 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4159 return log_oom();
4160
4161 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4162 char as_uuid[37];
4163
4164 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4165 return log_oom();
4166 }
4167
4168 if (fdset_size(fds) > 0) {
4169 r = fdset_cloexec(fds, false);
4170 if (r < 0)
4171 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4172
4173 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4174 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4175 return log_oom();
4176 }
4177
4178 env_use = strv_env_merge(2, envp, arg_setenv);
4179 if (!env_use)
4180 return log_oom();
4181
4182 /* Let the parent know that we are ready and
4183 * wait until the parent is ready with the
4184 * setup, too... */
4185 if (!barrier_place_and_sync(barrier)) { /* #4 */
4186 log_error("Parent died too early");
4187 return -ESRCH;
4188 }
4189
4190 /* Now, explicitly close the log, so that we
4191 * then can close all remaining fds. Closing
4192 * the log explicitly first has the benefit
4193 * that the logging subsystem knows about it,
4194 * and is thus ready to be reopened should we
4195 * need it again. Note that the other fds
4196 * closed here are at least the locking and
4197 * barrier fds. */
4198 log_close();
4199 (void) fdset_close_others(fds);
4200
4201 if (arg_boot) {
4202 char **a;
4203 size_t m;
4204
4205 /* Automatically search for the init system */
4206
4207 m = 1 + argc - optind;
4208 a = newa(char*, m + 1);
4209 memcpy(a + 1, argv + optind, m * sizeof(char*));
4210
4211 a[0] = (char*) "/usr/lib/systemd/systemd";
4212 execve(a[0], a, env_use);
4213
4214 a[0] = (char*) "/lib/systemd/systemd";
4215 execve(a[0], a, env_use);
4216
4217 a[0] = (char*) "/sbin/init";
4218 execve(a[0], a, env_use);
4219 } else if (argc > optind)
4220 execvpe(argv[optind], argv + optind, env_use);
4221 else {
4222 chdir(home ? home : "/root");
4223 execle("/bin/bash", "-bash", NULL, env_use);
4224 execle("/bin/sh", "-sh", NULL, env_use);
4225 }
4226
4227 (void) log_open();
4228 return log_error_errno(errno, "execv() failed: %m");
4229 }
4230
4231 static int outer_child(
4232 Barrier *barrier,
4233 const char *directory,
4234 const char *console,
4235 const char *root_device, bool root_device_rw,
4236 const char *home_device, bool home_device_rw,
4237 const char *srv_device, bool srv_device_rw,
4238 bool interactive,
4239 bool secondary,
4240 int pid_socket,
4241 int kmsg_socket,
4242 int rtnl_socket,
4243 FDSet *fds,
4244 int argc,
4245 char *argv[]) {
4246
4247 pid_t pid;
4248 ssize_t l;
4249 int r;
4250
4251 assert(barrier);
4252 assert(directory);
4253 assert(console);
4254 assert(pid_socket >= 0);
4255 assert(kmsg_socket >= 0);
4256
4257 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4258 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4259
4260 if (interactive) {
4261 close_nointr(STDIN_FILENO);
4262 close_nointr(STDOUT_FILENO);
4263 close_nointr(STDERR_FILENO);
4264
4265 r = open_terminal(console, O_RDWR);
4266 if (r != STDIN_FILENO) {
4267 if (r >= 0) {
4268 safe_close(r);
4269 r = -EINVAL;
4270 }
4271
4272 return log_error_errno(r, "Failed to open console: %m");
4273 }
4274
4275 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4276 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4277 return log_error_errno(errno, "Failed to duplicate console: %m");
4278 }
4279
4280 r = reset_audit_loginuid();
4281 if (r < 0)
4282 return r;
4283
4284 /* Mark everything as slave, so that we still
4285 * receive mounts from the real root, but don't
4286 * propagate mounts to the real root. */
4287 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4288 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4289
4290 r = mount_devices(directory,
4291 root_device, root_device_rw,
4292 home_device, home_device_rw,
4293 srv_device, srv_device_rw);
4294 if (r < 0)
4295 return r;
4296
4297 r = determine_uid_shift(directory);
4298 if (r < 0)
4299 return r;
4300
4301 /* Turn directory into bind mount */
4302 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4303 return log_error_errno(errno, "Failed to make bind mount: %m");
4304
4305 r = setup_volatile(directory);
4306 if (r < 0)
4307 return r;
4308
4309 r = setup_volatile_state(directory);
4310 if (r < 0)
4311 return r;
4312
4313 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4314 if (r < 0)
4315 return r;
4316
4317 if (arg_read_only) {
4318 r = bind_remount_recursive(directory, true);
4319 if (r < 0)
4320 return log_error_errno(r, "Failed to make tree read-only: %m");
4321 }
4322
4323 r = mount_all(directory, false);
4324 if (r < 0)
4325 return r;
4326
4327 if (copy_devnodes(directory) < 0)
4328 return r;
4329
4330 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4331
4332 if (setup_pts(directory) < 0)
4333 return r;
4334
4335 r = setup_propagate(directory);
4336 if (r < 0)
4337 return r;
4338
4339 r = setup_dev_console(directory, console);
4340 if (r < 0)
4341 return r;
4342
4343 r = setup_seccomp();
4344 if (r < 0)
4345 return r;
4346
4347 r = setup_timezone(directory);
4348 if (r < 0)
4349 return r;
4350
4351 r = setup_resolv_conf(directory);
4352 if (r < 0)
4353 return r;
4354
4355 r = setup_journal(directory);
4356 if (r < 0)
4357 return r;
4358
4359 r = mount_custom(directory);
4360 if (r < 0)
4361 return r;
4362
4363 r = mount_cgroup(directory);
4364 if (r < 0)
4365 return r;
4366
4367 r = mount_move_root(directory);
4368 if (r < 0)
4369 return log_error_errno(r, "Failed to move root directory: %m");
4370
4371 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4372 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4373 (arg_private_network ? CLONE_NEWNET : 0) |
4374 (arg_userns ? CLONE_NEWUSER : 0),
4375 NULL);
4376 if (pid < 0)
4377 return log_error_errno(errno, "Failed to fork inner child: %m");
4378
4379 if (pid == 0) {
4380 pid_socket = safe_close(pid_socket);
4381
4382 /* The inner child has all namespaces that are
4383 * requested, so that we all are owned by the user if
4384 * user namespaces are turned on. */
4385
4386 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4387 if (r < 0)
4388 _exit(EXIT_FAILURE);
4389
4390 _exit(EXIT_SUCCESS);
4391 }
4392
4393 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4394 if (l < 0)
4395 return log_error_errno(errno, "Failed to send PID: %m");
4396 if (l != sizeof(pid)) {
4397 log_error("Short write while sending PID.");
4398 return -EIO;
4399 }
4400
4401 pid_socket = safe_close(pid_socket);
4402
4403 return 0;
4404 }
4405
4406 static int setup_uid_map(pid_t pid) {
4407 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4408 int r;
4409
4410 assert(pid > 1);
4411
4412 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4413 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4414 r = write_string_file(uid_map, line);
4415 if (r < 0)
4416 return log_error_errno(r, "Failed to write UID map: %m");
4417
4418 /* We always assign the same UID and GID ranges */
4419 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4420 r = write_string_file(uid_map, line);
4421 if (r < 0)
4422 return log_error_errno(r, "Failed to write GID map: %m");
4423
4424 return 0;
4425 }
4426
4427 static int chown_cgroup(pid_t pid) {
4428 _cleanup_free_ char *path = NULL, *fs = NULL;
4429 _cleanup_close_ int fd = -1;
4430 const char *fn;
4431 int r;
4432
4433 r = cg_pid_get_path(NULL, pid, &path);
4434 if (r < 0)
4435 return log_error_errno(r, "Failed to get container cgroup path: %m");
4436
4437 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4438 if (r < 0)
4439 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4440
4441 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4442 if (fd < 0)
4443 return log_error_errno(errno, "Failed to open %s: %m", fs);
4444
4445 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4446 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4447 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4448
4449 return 0;
4450 }
4451
4452 int main(int argc, char *argv[]) {
4453
4454 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4455 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4456 _cleanup_close_ int master = -1, image_fd = -1;
4457 _cleanup_fdset_free_ FDSet *fds = NULL;
4458 int r, n_fd_passed, loop_nr = -1;
4459 char veth_name[IFNAMSIZ];
4460 bool secondary = false, remove_subvol = false;
4461 sigset_t mask, mask_chld;
4462 pid_t pid = 0;
4463 int ret = EXIT_SUCCESS;
4464 union in_addr_union exposed = {};
4465 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4466 bool interactive;
4467
4468 log_parse_environment();
4469 log_open();
4470
4471 r = parse_argv(argc, argv);
4472 if (r <= 0)
4473 goto finish;
4474
4475 r = determine_names();
4476 if (r < 0)
4477 goto finish;
4478
4479 if (geteuid() != 0) {
4480 log_error("Need to be root.");
4481 r = -EPERM;
4482 goto finish;
4483 }
4484
4485 n_fd_passed = sd_listen_fds(false);
4486 if (n_fd_passed > 0) {
4487 r = fdset_new_listen_fds(&fds, false);
4488 if (r < 0) {
4489 log_error_errno(r, "Failed to collect file descriptors: %m");
4490 goto finish;
4491 }
4492 }
4493
4494 if (arg_directory) {
4495 assert(!arg_image);
4496
4497 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4498 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4499 r = -EINVAL;
4500 goto finish;
4501 }
4502
4503 if (arg_ephemeral) {
4504 _cleanup_free_ char *np = NULL;
4505
4506 /* If the specified path is a mount point we
4507 * generate the new snapshot immediately
4508 * inside it under a random name. However if
4509 * the specified is not a mount point we
4510 * create the new snapshot in the parent
4511 * directory, just next to it. */
4512 r = path_is_mount_point(arg_directory, 0);
4513 if (r < 0) {
4514 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4515 goto finish;
4516 }
4517 if (r > 0)
4518 r = tempfn_random_child(arg_directory, &np);
4519 else
4520 r = tempfn_random(arg_directory, &np);
4521 if (r < 0) {
4522 log_error_errno(r, "Failed to generate name for snapshot: %m");
4523 goto finish;
4524 }
4525
4526 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4527 if (r < 0) {
4528 log_error_errno(r, "Failed to lock %s: %m", np);
4529 goto finish;
4530 }
4531
4532 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4533 if (r < 0) {
4534 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4535 goto finish;
4536 }
4537
4538 free(arg_directory);
4539 arg_directory = np;
4540 np = NULL;
4541
4542 remove_subvol = true;
4543
4544 } else {
4545 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4546 if (r == -EBUSY) {
4547 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4548 goto finish;
4549 }
4550 if (r < 0) {
4551 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4552 return r;
4553 }
4554
4555 if (arg_template) {
4556 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4557 if (r == -EEXIST) {
4558 if (!arg_quiet)
4559 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4560 } else if (r < 0) {
4561 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4562 goto finish;
4563 } else {
4564 if (!arg_quiet)
4565 log_info("Populated %s from template %s.", arg_directory, arg_template);
4566 }
4567 }
4568 }
4569
4570 if (arg_boot) {
4571 if (path_is_os_tree(arg_directory) <= 0) {
4572 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4573 r = -EINVAL;
4574 goto finish;
4575 }
4576 } else {
4577 const char *p;
4578
4579 p = strjoina(arg_directory,
4580 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4581 if (access(p, F_OK) < 0) {
4582 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4583 r = -EINVAL;
4584 goto finish;
4585 }
4586 }
4587
4588 } else {
4589 char template[] = "/tmp/nspawn-root-XXXXXX";
4590
4591 assert(arg_image);
4592 assert(!arg_template);
4593
4594 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4595 if (r == -EBUSY) {
4596 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4597 goto finish;
4598 }
4599 if (r < 0) {
4600 r = log_error_errno(r, "Failed to create image lock: %m");
4601 goto finish;
4602 }
4603
4604 if (!mkdtemp(template)) {
4605 log_error_errno(errno, "Failed to create temporary directory: %m");
4606 r = -errno;
4607 goto finish;
4608 }
4609
4610 arg_directory = strdup(template);
4611 if (!arg_directory) {
4612 r = log_oom();
4613 goto finish;
4614 }
4615
4616 image_fd = setup_image(&device_path, &loop_nr);
4617 if (image_fd < 0) {
4618 r = image_fd;
4619 goto finish;
4620 }
4621
4622 r = dissect_image(image_fd,
4623 &root_device, &root_device_rw,
4624 &home_device, &home_device_rw,
4625 &srv_device, &srv_device_rw,
4626 &secondary);
4627 if (r < 0)
4628 goto finish;
4629 }
4630
4631 r = custom_mounts_prepare();
4632 if (r < 0)
4633 goto finish;
4634
4635 interactive =
4636 isatty(STDIN_FILENO) > 0 &&
4637 isatty(STDOUT_FILENO) > 0;
4638
4639 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4640 if (master < 0) {
4641 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4642 goto finish;
4643 }
4644
4645 r = ptsname_malloc(master, &console);
4646 if (r < 0) {
4647 r = log_error_errno(r, "Failed to determine tty name: %m");
4648 goto finish;
4649 }
4650
4651 if (unlockpt(master) < 0) {
4652 r = log_error_errno(errno, "Failed to unlock tty: %m");
4653 goto finish;
4654 }
4655
4656 if (!arg_quiet)
4657 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4658 arg_machine, arg_image ?: arg_directory);
4659
4660 assert_se(sigemptyset(&mask) == 0);
4661 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
4662 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
4663
4664 assert_se(sigemptyset(&mask_chld) == 0);
4665 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4666
4667 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4668 r = log_error_errno(errno, "Failed to become subreaper: %m");
4669 goto finish;
4670 }
4671
4672 for (;;) {
4673 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 };
4674 ContainerStatus container_status;
4675 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4676 static const struct sigaction sa = {
4677 .sa_handler = nop_handler,
4678 .sa_flags = SA_NOCLDSTOP,
4679 };
4680 int ifi = 0;
4681 ssize_t l;
4682 _cleanup_event_unref_ sd_event *event = NULL;
4683 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4684 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4685 char last_char = 0;
4686
4687 r = barrier_create(&barrier);
4688 if (r < 0) {
4689 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4690 goto finish;
4691 }
4692
4693 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4694 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4695 goto finish;
4696 }
4697
4698 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4699 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4700 goto finish;
4701 }
4702
4703 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4704 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4705 goto finish;
4706 }
4707
4708 /* Child can be killed before execv(), so handle SIGCHLD
4709 * in order to interrupt parent's blocking calls and
4710 * give it a chance to call wait() and terminate. */
4711 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4712 if (r < 0) {
4713 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4714 goto finish;
4715 }
4716
4717 r = sigaction(SIGCHLD, &sa, NULL);
4718 if (r < 0) {
4719 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4720 goto finish;
4721 }
4722
4723 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4724 if (pid < 0) {
4725 if (errno == EINVAL)
4726 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4727 else
4728 r = log_error_errno(errno, "clone() failed: %m");
4729
4730 goto finish;
4731 }
4732
4733 if (pid == 0) {
4734 /* The outer child only has a file system namespace. */
4735 barrier_set_role(&barrier, BARRIER_CHILD);
4736
4737 master = safe_close(master);
4738
4739 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4740 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4741 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4742
4743 (void) reset_all_signal_handlers();
4744 (void) reset_signal_mask();
4745
4746 r = outer_child(&barrier,
4747 arg_directory,
4748 console,
4749 root_device, root_device_rw,
4750 home_device, home_device_rw,
4751 srv_device, srv_device_rw,
4752 interactive,
4753 secondary,
4754 pid_socket_pair[1],
4755 kmsg_socket_pair[1],
4756 rtnl_socket_pair[1],
4757 fds,
4758 argc, argv);
4759 if (r < 0)
4760 _exit(EXIT_FAILURE);
4761
4762 _exit(EXIT_SUCCESS);
4763 }
4764
4765 barrier_set_role(&barrier, BARRIER_PARENT);
4766
4767 fdset_free(fds);
4768 fds = NULL;
4769
4770 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4771 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4772 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4773
4774 /* Wait for the outer child. */
4775 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4776 if (r < 0)
4777 goto finish;
4778 if (r != 0) {
4779 r = -EIO;
4780 goto finish;
4781 }
4782 pid = 0;
4783
4784 /* And now retrieve the PID of the inner child. */
4785 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4786 if (l < 0) {
4787 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4788 goto finish;
4789 }
4790 if (l != sizeof(pid)) {
4791 log_error("Short read while reading inner child PID: %m");
4792 r = EIO;
4793 goto finish;
4794 }
4795
4796 log_debug("Init process invoked as PID " PID_FMT, pid);
4797
4798 if (arg_userns) {
4799 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4800 log_error("Child died too early.");
4801 r = -ESRCH;
4802 goto finish;
4803 }
4804
4805 r = setup_uid_map(pid);
4806 if (r < 0)
4807 goto finish;
4808
4809 (void) barrier_place(&barrier); /* #2 */
4810 }
4811
4812 r = move_network_interfaces(pid);
4813 if (r < 0)
4814 goto finish;
4815
4816 r = setup_veth(pid, veth_name, &ifi);
4817 if (r < 0)
4818 goto finish;
4819
4820 r = setup_bridge(veth_name, &ifi);
4821 if (r < 0)
4822 goto finish;
4823
4824 r = setup_macvlan(pid);
4825 if (r < 0)
4826 goto finish;
4827
4828 r = setup_ipvlan(pid);
4829 if (r < 0)
4830 goto finish;
4831
4832 r = register_machine(pid, ifi);
4833 if (r < 0)
4834 goto finish;
4835
4836 r = chown_cgroup(pid);
4837 if (r < 0)
4838 goto finish;
4839
4840 /* Notify the child that the parent is ready with all
4841 * its setup (including cgroup-ification), and that
4842 * the child can now hand over control to the code to
4843 * run inside the container. */
4844 (void) barrier_place(&barrier); /* #3 */
4845
4846 /* Block SIGCHLD here, before notifying child.
4847 * process_pty() will handle it with the other signals. */
4848 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4849
4850 /* Reset signal to default */
4851 r = default_signals(SIGCHLD, -1);
4852 if (r < 0) {
4853 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4854 goto finish;
4855 }
4856
4857 /* Let the child know that we are ready and wait that the child is completely ready now. */
4858 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4859 log_error("Client died too early.");
4860 r = -ESRCH;
4861 goto finish;
4862 }
4863
4864 sd_notifyf(false,
4865 "READY=1\n"
4866 "STATUS=Container running.\n"
4867 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4868
4869 r = sd_event_new(&event);
4870 if (r < 0) {
4871 log_error_errno(r, "Failed to get default event source: %m");
4872 goto finish;
4873 }
4874
4875 if (arg_kill_signal > 0) {
4876 /* Try to kill the init system on SIGINT or SIGTERM */
4877 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4878 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4879 } else {
4880 /* Immediately exit */
4881 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4882 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4883 }
4884
4885 /* simply exit on sigchld */
4886 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4887
4888 if (arg_expose_ports) {
4889 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4890 if (r < 0)
4891 goto finish;
4892
4893 (void) expose_ports(rtnl, &exposed);
4894 }
4895
4896 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4897
4898 r = pty_forward_new(event, master, true, !interactive, &forward);
4899 if (r < 0) {
4900 log_error_errno(r, "Failed to create PTY forwarder: %m");
4901 goto finish;
4902 }
4903
4904 r = sd_event_loop(event);
4905 if (r < 0) {
4906 log_error_errno(r, "Failed to run event loop: %m");
4907 goto finish;
4908 }
4909
4910 pty_forward_get_last_char(forward, &last_char);
4911
4912 forward = pty_forward_free(forward);
4913
4914 if (!arg_quiet && last_char != '\n')
4915 putc('\n', stdout);
4916
4917 /* Kill if it is not dead yet anyway */
4918 terminate_machine(pid);
4919
4920 /* Normally redundant, but better safe than sorry */
4921 kill(pid, SIGKILL);
4922
4923 r = wait_for_container(pid, &container_status);
4924 pid = 0;
4925
4926 if (r < 0)
4927 /* We failed to wait for the container, or the
4928 * container exited abnormally */
4929 goto finish;
4930 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4931 /* The container exited with a non-zero
4932 * status, or with zero status and no reboot
4933 * was requested. */
4934 ret = r;
4935 break;
4936 }
4937
4938 /* CONTAINER_REBOOTED, loop again */
4939
4940 if (arg_keep_unit) {
4941 /* Special handling if we are running as a
4942 * service: instead of simply restarting the
4943 * machine we want to restart the entire
4944 * service, so let's inform systemd about this
4945 * with the special exit code 133. The service
4946 * file uses RestartForceExitStatus=133 so
4947 * that this results in a full nspawn
4948 * restart. This is necessary since we might
4949 * have cgroup parameters set we want to have
4950 * flushed out. */
4951 ret = 133;
4952 r = 0;
4953 break;
4954 }
4955
4956 flush_ports(&exposed);
4957 }
4958
4959 finish:
4960 sd_notify(false,
4961 "STOPPING=1\n"
4962 "STATUS=Terminating...");
4963
4964 if (pid > 0)
4965 kill(pid, SIGKILL);
4966
4967 loop_remove(loop_nr, &image_fd);
4968
4969 if (remove_subvol && arg_directory) {
4970 int k;
4971
4972 k = btrfs_subvol_remove(arg_directory, true);
4973 if (k < 0)
4974 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4975 }
4976
4977 if (arg_machine) {
4978 const char *p;
4979
4980 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4981 (void) rm_rf(p, REMOVE_ROOT);
4982 }
4983
4984 free(arg_directory);
4985 free(arg_template);
4986 free(arg_image);
4987 free(arg_machine);
4988 free(arg_user);
4989 strv_free(arg_setenv);
4990 strv_free(arg_network_interfaces);
4991 strv_free(arg_network_macvlan);
4992 strv_free(arg_network_ipvlan);
4993 custom_mount_free_all();
4994
4995 flush_ports(&exposed);
4996
4997 while (arg_expose_ports) {
4998 ExposePort *p = arg_expose_ports;
4999 LIST_REMOVE(ports, arg_expose_ports, p);
5000 free(p);
5001 }
5002
5003 return r < 0 ? EXIT_FAILURE : ret;
5004 }