]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #470 from marineam/escape
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* The two ways a container run can end. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
116
/* Values accepted by --link-journal= (the "try-" variants additionally set
 * arg_link_journal_try). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
123
/* Values accepted by --volatile[=MODE]: a boolean, or "state". */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
129
/* Kind of user-requested mount (--bind=, --tmpfs=, --overlay=). The numeric
 * order is significant: custom_mount_compare() uses it as a tie-breaker when
 * two mounts share a destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND    = 0,
        CUSTOM_MOUNT_TMPFS   = 1,
        CUSTOM_MOUNT_OVERLAY = 2,
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 free(arg_custom_mounts);
313 arg_custom_mounts = NULL;
314 arg_n_custom_mounts = 0;
315 }
316
317 static int custom_mount_compare(const void *a, const void *b) {
318 const CustomMount *x = a, *y = b;
319 int r;
320
321 r = path_compare(x->destination, y->destination);
322 if (r != 0)
323 return r;
324
325 if (x->type < y->type)
326 return -1;
327 if (x->type > y->type)
328 return 1;
329
330 return 0;
331 }
332
333 static int custom_mounts_prepare(void) {
334 unsigned i;
335 int r;
336
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
339
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i = 0; i < arg_n_custom_mounts; i++) {
342 CustomMount *m = &arg_custom_mounts[i];
343
344 if (m->type != CUSTOM_MOUNT_OVERLAY)
345 continue;
346
347 if (m->work_dir)
348 continue;
349
350 if (m->read_only)
351 continue;
352
353 r = tempfn_random(m->source, NULL, &m->work_dir);
354 if (r < 0)
355 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
356 }
357
358 return 0;
359 }
360
/* Replaces *b with a cleaned-up version of path.
 *
 * The path is resolved with canonicalize_file_name(). If that fails because
 * the path does not exist, it is merely made absolute relative to the current
 * working directory. The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negated errno
 * of the failed canonicalization. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
381
382 static int parse_argv(int argc, char *argv[]) {
383
384 enum {
385 ARG_VERSION = 0x100,
386 ARG_PRIVATE_NETWORK,
387 ARG_UUID,
388 ARG_READ_ONLY,
389 ARG_CAPABILITY,
390 ARG_DROP_CAPABILITY,
391 ARG_LINK_JOURNAL,
392 ARG_BIND,
393 ARG_BIND_RO,
394 ARG_TMPFS,
395 ARG_OVERLAY,
396 ARG_OVERLAY_RO,
397 ARG_SETENV,
398 ARG_SHARE_SYSTEM,
399 ARG_REGISTER,
400 ARG_KEEP_UNIT,
401 ARG_NETWORK_INTERFACE,
402 ARG_NETWORK_MACVLAN,
403 ARG_NETWORK_IPVLAN,
404 ARG_NETWORK_BRIDGE,
405 ARG_PERSONALITY,
406 ARG_VOLATILE,
407 ARG_TEMPLATE,
408 ARG_PROPERTY,
409 ARG_PRIVATE_USERS,
410 ARG_KILL_SIGNAL,
411 };
412
413 static const struct option options[] = {
414 { "help", no_argument, NULL, 'h' },
415 { "version", no_argument, NULL, ARG_VERSION },
416 { "directory", required_argument, NULL, 'D' },
417 { "template", required_argument, NULL, ARG_TEMPLATE },
418 { "ephemeral", no_argument, NULL, 'x' },
419 { "user", required_argument, NULL, 'u' },
420 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
421 { "boot", no_argument, NULL, 'b' },
422 { "uuid", required_argument, NULL, ARG_UUID },
423 { "read-only", no_argument, NULL, ARG_READ_ONLY },
424 { "capability", required_argument, NULL, ARG_CAPABILITY },
425 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
426 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
427 { "bind", required_argument, NULL, ARG_BIND },
428 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
429 { "tmpfs", required_argument, NULL, ARG_TMPFS },
430 { "overlay", required_argument, NULL, ARG_OVERLAY },
431 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
432 { "machine", required_argument, NULL, 'M' },
433 { "slice", required_argument, NULL, 'S' },
434 { "setenv", required_argument, NULL, ARG_SETENV },
435 { "selinux-context", required_argument, NULL, 'Z' },
436 { "selinux-apifs-context", required_argument, NULL, 'L' },
437 { "quiet", no_argument, NULL, 'q' },
438 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
439 { "register", required_argument, NULL, ARG_REGISTER },
440 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
441 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
442 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
443 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
444 { "network-veth", no_argument, NULL, 'n' },
445 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
446 { "personality", required_argument, NULL, ARG_PERSONALITY },
447 { "image", required_argument, NULL, 'i' },
448 { "volatile", optional_argument, NULL, ARG_VOLATILE },
449 { "port", required_argument, NULL, 'p' },
450 { "property", required_argument, NULL, ARG_PROPERTY },
451 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
452 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
453 {}
454 };
455
456 int c, r;
457 uint64_t plus = 0, minus = 0;
458
459 assert(argc >= 0);
460 assert(argv);
461
462 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
463
464 switch (c) {
465
466 case 'h':
467 help();
468 return 0;
469
470 case ARG_VERSION:
471 puts(PACKAGE_STRING);
472 puts(SYSTEMD_FEATURES);
473 return 0;
474
475 case 'D':
476 r = set_sanitized_path(&arg_directory, optarg);
477 if (r < 0)
478 return log_error_errno(r, "Invalid root directory: %m");
479
480 break;
481
482 case ARG_TEMPLATE:
483 r = set_sanitized_path(&arg_template, optarg);
484 if (r < 0)
485 return log_error_errno(r, "Invalid template directory: %m");
486
487 break;
488
489 case 'i':
490 r = set_sanitized_path(&arg_image, optarg);
491 if (r < 0)
492 return log_error_errno(r, "Invalid image path: %m");
493
494 break;
495
496 case 'x':
497 arg_ephemeral = true;
498 break;
499
500 case 'u':
501 free(arg_user);
502 arg_user = strdup(optarg);
503 if (!arg_user)
504 return log_oom();
505
506 break;
507
508 case ARG_NETWORK_BRIDGE:
509 arg_network_bridge = optarg;
510
511 /* fall through */
512
513 case 'n':
514 arg_network_veth = true;
515 arg_private_network = true;
516 break;
517
518 case ARG_NETWORK_INTERFACE:
519 if (strv_extend(&arg_network_interfaces, optarg) < 0)
520 return log_oom();
521
522 arg_private_network = true;
523 break;
524
525 case ARG_NETWORK_MACVLAN:
526 if (strv_extend(&arg_network_macvlan, optarg) < 0)
527 return log_oom();
528
529 arg_private_network = true;
530 break;
531
532 case ARG_NETWORK_IPVLAN:
533 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
534 return log_oom();
535
536 /* fall through */
537
538 case ARG_PRIVATE_NETWORK:
539 arg_private_network = true;
540 break;
541
542 case 'b':
543 arg_boot = true;
544 break;
545
546 case ARG_UUID:
547 r = sd_id128_from_string(optarg, &arg_uuid);
548 if (r < 0) {
549 log_error("Invalid UUID: %s", optarg);
550 return r;
551 }
552 break;
553
554 case 'S':
555 arg_slice = optarg;
556 break;
557
558 case 'M':
559 if (isempty(optarg)) {
560 free(arg_machine);
561 arg_machine = NULL;
562 } else {
563 if (!machine_name_is_valid(optarg)) {
564 log_error("Invalid machine name: %s", optarg);
565 return -EINVAL;
566 }
567
568 r = free_and_strdup(&arg_machine, optarg);
569 if (r < 0)
570 return log_oom();
571
572 break;
573 }
574
575 case 'Z':
576 arg_selinux_context = optarg;
577 break;
578
579 case 'L':
580 arg_selinux_apifs_context = optarg;
581 break;
582
583 case ARG_READ_ONLY:
584 arg_read_only = true;
585 break;
586
587 case ARG_CAPABILITY:
588 case ARG_DROP_CAPABILITY: {
589 const char *state, *word;
590 size_t length;
591
592 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
593 _cleanup_free_ char *t;
594
595 t = strndup(word, length);
596 if (!t)
597 return log_oom();
598
599 if (streq(t, "all")) {
600 if (c == ARG_CAPABILITY)
601 plus = (uint64_t) -1;
602 else
603 minus = (uint64_t) -1;
604 } else {
605 int cap;
606
607 cap = capability_from_name(t);
608 if (cap < 0) {
609 log_error("Failed to parse capability %s.", t);
610 return -EINVAL;
611 }
612
613 if (c == ARG_CAPABILITY)
614 plus |= 1ULL << (uint64_t) cap;
615 else
616 minus |= 1ULL << (uint64_t) cap;
617 }
618 }
619
620 break;
621 }
622
623 case 'j':
624 arg_link_journal = LINK_GUEST;
625 arg_link_journal_try = true;
626 break;
627
628 case ARG_LINK_JOURNAL:
629 if (streq(optarg, "auto")) {
630 arg_link_journal = LINK_AUTO;
631 arg_link_journal_try = false;
632 } else if (streq(optarg, "no")) {
633 arg_link_journal = LINK_NO;
634 arg_link_journal_try = false;
635 } else if (streq(optarg, "guest")) {
636 arg_link_journal = LINK_GUEST;
637 arg_link_journal_try = false;
638 } else if (streq(optarg, "host")) {
639 arg_link_journal = LINK_HOST;
640 arg_link_journal_try = false;
641 } else if (streq(optarg, "try-guest")) {
642 arg_link_journal = LINK_GUEST;
643 arg_link_journal_try = true;
644 } else if (streq(optarg, "try-host")) {
645 arg_link_journal = LINK_HOST;
646 arg_link_journal_try = true;
647 } else {
648 log_error("Failed to parse link journal mode %s", optarg);
649 return -EINVAL;
650 }
651
652 break;
653
654 case ARG_BIND:
655 case ARG_BIND_RO: {
656 _cleanup_free_ char *source = NULL, *destination = NULL;
657 CustomMount *m;
658 char *e;
659
660 e = strchr(optarg, ':');
661 if (e) {
662 source = strndup(optarg, e - optarg);
663 destination = strdup(e + 1);
664 } else {
665 source = strdup(optarg);
666 destination = strdup(optarg);
667 }
668
669 if (!source || !destination)
670 return log_oom();
671
672 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
673 log_error("Invalid bind mount specification: %s", optarg);
674 return -EINVAL;
675 }
676
677 m = custom_mount_add(CUSTOM_MOUNT_BIND);
678 if (!m)
679 return log_oom();
680
681 m->source = source;
682 m->destination = destination;
683 m->read_only = c == ARG_BIND_RO;
684
685 source = destination = NULL;
686
687 break;
688 }
689
690 case ARG_TMPFS: {
691 _cleanup_free_ char *path = NULL, *opts = NULL;
692 CustomMount *m;
693 char *e;
694
695 e = strchr(optarg, ':');
696 if (e) {
697 path = strndup(optarg, e - optarg);
698 opts = strdup(e + 1);
699 } else {
700 path = strdup(optarg);
701 opts = strdup("mode=0755");
702 }
703
704 if (!path || !opts)
705 return log_oom();
706
707 if (!path_is_absolute(path)) {
708 log_error("Invalid tmpfs specification: %s", optarg);
709 return -EINVAL;
710 }
711
712 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
713 if (!m)
714 return log_oom();
715
716 m->destination = path;
717 m->options = opts;
718
719 path = opts = NULL;
720
721 break;
722 }
723
724 case ARG_OVERLAY:
725 case ARG_OVERLAY_RO: {
726 _cleanup_free_ char *upper = NULL, *destination = NULL;
727 _cleanup_strv_free_ char **lower = NULL;
728 CustomMount *m;
729 unsigned n = 0;
730 char **i;
731
732 lower = strv_split(optarg, ":");
733 if (!lower)
734 return log_oom();
735
736 STRV_FOREACH(i, lower) {
737 if (!path_is_absolute(*i)) {
738 log_error("Overlay path %s is not absolute.", *i);
739 return -EINVAL;
740 }
741
742 n++;
743 }
744
745 if (n < 2) {
746 log_error("--overlay= needs at least two colon-separated directories specified.");
747 return -EINVAL;
748 }
749
750 if (n == 2) {
751 /* If two parameters are specified,
752 * the first one is the lower, the
753 * second one the upper directory. And
754 * we'll also define the the
755 * destination mount point the same as
756 * the upper. */
757 upper = lower[1];
758 lower[1] = NULL;
759
760 destination = strdup(upper);
761 if (!destination)
762 return log_oom();
763
764 } else {
765 upper = lower[n - 2];
766 destination = lower[n - 1];
767 lower[n - 2] = NULL;
768 }
769
770 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
771 if (!m)
772 return log_oom();
773
774 m->destination = destination;
775 m->source = upper;
776 m->lower = lower;
777 m->read_only = c == ARG_OVERLAY_RO;
778
779 upper = destination = NULL;
780 lower = NULL;
781
782 break;
783 }
784
785 case ARG_SETENV: {
786 char **n;
787
788 if (!env_assignment_is_valid(optarg)) {
789 log_error("Environment variable assignment '%s' is not valid.", optarg);
790 return -EINVAL;
791 }
792
793 n = strv_env_set(arg_setenv, optarg);
794 if (!n)
795 return log_oom();
796
797 strv_free(arg_setenv);
798 arg_setenv = n;
799 break;
800 }
801
802 case 'q':
803 arg_quiet = true;
804 break;
805
806 case ARG_SHARE_SYSTEM:
807 arg_share_system = true;
808 break;
809
810 case ARG_REGISTER:
811 r = parse_boolean(optarg);
812 if (r < 0) {
813 log_error("Failed to parse --register= argument: %s", optarg);
814 return r;
815 }
816
817 arg_register = r;
818 break;
819
820 case ARG_KEEP_UNIT:
821 arg_keep_unit = true;
822 break;
823
824 case ARG_PERSONALITY:
825
826 arg_personality = personality_from_string(optarg);
827 if (arg_personality == PERSONALITY_INVALID) {
828 log_error("Unknown or unsupported personality '%s'.", optarg);
829 return -EINVAL;
830 }
831
832 break;
833
834 case ARG_VOLATILE:
835
836 if (!optarg)
837 arg_volatile = VOLATILE_YES;
838 else {
839 r = parse_boolean(optarg);
840 if (r < 0) {
841 if (streq(optarg, "state"))
842 arg_volatile = VOLATILE_STATE;
843 else {
844 log_error("Failed to parse --volatile= argument: %s", optarg);
845 return r;
846 }
847 } else
848 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
849 }
850
851 break;
852
853 case 'p': {
854 const char *split, *e;
855 uint16_t container_port, host_port;
856 int protocol;
857 ExposePort *p;
858
859 if ((e = startswith(optarg, "tcp:")))
860 protocol = IPPROTO_TCP;
861 else if ((e = startswith(optarg, "udp:")))
862 protocol = IPPROTO_UDP;
863 else {
864 e = optarg;
865 protocol = IPPROTO_TCP;
866 }
867
868 split = strchr(e, ':');
869 if (split) {
870 char v[split - e + 1];
871
872 memcpy(v, e, split - e);
873 v[split - e] = 0;
874
875 r = safe_atou16(v, &host_port);
876 if (r < 0 || host_port <= 0) {
877 log_error("Failed to parse host port: %s", optarg);
878 return -EINVAL;
879 }
880
881 r = safe_atou16(split + 1, &container_port);
882 } else {
883 r = safe_atou16(e, &container_port);
884 host_port = container_port;
885 }
886
887 if (r < 0 || container_port <= 0) {
888 log_error("Failed to parse host port: %s", optarg);
889 return -EINVAL;
890 }
891
892 LIST_FOREACH(ports, p, arg_expose_ports) {
893 if (p->protocol == protocol && p->host_port == host_port) {
894 log_error("Duplicate port specification: %s", optarg);
895 return -EINVAL;
896 }
897 }
898
899 p = new(ExposePort, 1);
900 if (!p)
901 return log_oom();
902
903 p->protocol = protocol;
904 p->host_port = host_port;
905 p->container_port = container_port;
906
907 LIST_PREPEND(ports, arg_expose_ports, p);
908
909 break;
910 }
911
912 case ARG_PROPERTY:
913 if (strv_extend(&arg_property, optarg) < 0)
914 return log_oom();
915
916 break;
917
918 case ARG_PRIVATE_USERS:
919 if (optarg) {
920 _cleanup_free_ char *buffer = NULL;
921 const char *range, *shift;
922
923 range = strchr(optarg, ':');
924 if (range) {
925 buffer = strndup(optarg, range - optarg);
926 if (!buffer)
927 return log_oom();
928 shift = buffer;
929
930 range++;
931 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
932 log_error("Failed to parse UID range: %s", range);
933 return -EINVAL;
934 }
935 } else
936 shift = optarg;
937
938 if (parse_uid(shift, &arg_uid_shift) < 0) {
939 log_error("Failed to parse UID: %s", optarg);
940 return -EINVAL;
941 }
942 }
943
944 arg_userns = true;
945 break;
946
947 case ARG_KILL_SIGNAL:
948 arg_kill_signal = signal_from_string_try_harder(optarg);
949 if (arg_kill_signal < 0) {
950 log_error("Cannot parse signal: %s", optarg);
951 return -EINVAL;
952 }
953
954 break;
955
956 case '?':
957 return -EINVAL;
958
959 default:
960 assert_not_reached("Unhandled option");
961 }
962
963 if (arg_share_system)
964 arg_register = false;
965
966 if (arg_boot && arg_share_system) {
967 log_error("--boot and --share-system may not be combined.");
968 return -EINVAL;
969 }
970
971 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
972 log_error("--keep-unit may not be used when invoked from a user session.");
973 return -EINVAL;
974 }
975
976 if (arg_directory && arg_image) {
977 log_error("--directory= and --image= may not be combined.");
978 return -EINVAL;
979 }
980
981 if (arg_template && arg_image) {
982 log_error("--template= and --image= may not be combined.");
983 return -EINVAL;
984 }
985
986 if (arg_template && !(arg_directory || arg_machine)) {
987 log_error("--template= needs --directory= or --machine=.");
988 return -EINVAL;
989 }
990
991 if (arg_ephemeral && arg_template) {
992 log_error("--ephemeral and --template= may not be combined.");
993 return -EINVAL;
994 }
995
996 if (arg_ephemeral && arg_image) {
997 log_error("--ephemeral and --image= may not be combined.");
998 return -EINVAL;
999 }
1000
1001 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1002 log_error("--ephemeral and --link-journal= may not be combined.");
1003 return -EINVAL;
1004 }
1005
1006 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1007 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1008 return -EINVAL;
1009 }
1010
1011 if (arg_expose_ports && !arg_private_network) {
1012 log_error("Cannot use --port= without private networking.");
1013 return -EINVAL;
1014 }
1015
1016 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1017 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1018
1019 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1020
1021 if (arg_boot && arg_kill_signal <= 0)
1022 arg_kill_signal = SIGRTMIN+3;
1023
1024 return 1;
1025 }
1026
1027 static int tmpfs_patch_options(const char *options, char **ret) {
1028 char *buf = NULL;
1029
1030 if (arg_userns && arg_uid_shift != 0) {
1031
1032 if (options)
1033 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1034 else
1035 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1036 if (!buf)
1037 return -ENOMEM;
1038
1039 options = buf;
1040 }
1041
1042 #ifdef HAVE_SELINUX
1043 if (arg_selinux_apifs_context) {
1044 char *t;
1045
1046 if (options)
1047 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1048 else
1049 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1050 if (!t) {
1051 free(buf);
1052 return -ENOMEM;
1053 }
1054
1055 free(buf);
1056 buf = t;
1057 }
1058 #endif
1059
1060 *ret = buf;
1061 return !!buf;
1062 }
1063
1064 static int mount_all(const char *dest, bool userns) {
1065
1066 typedef struct MountPoint {
1067 const char *what;
1068 const char *where;
1069 const char *type;
1070 const char *options;
1071 unsigned long flags;
1072 bool fatal;
1073 bool userns;
1074 } MountPoint;
1075
1076 static const MountPoint mount_table[] = {
1077 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1078 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1079 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1080 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1081 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1082 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1083 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1084 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1085 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1086 #ifdef HAVE_SELINUX
1087 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1088 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1089 #endif
1090 };
1091
1092 unsigned k;
1093 int r;
1094
1095 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1096 _cleanup_free_ char *where = NULL, *options = NULL;
1097 const char *o;
1098
1099 if (userns != mount_table[k].userns)
1100 continue;
1101
1102 where = prefix_root(dest, mount_table[k].where);
1103 if (!where)
1104 return log_oom();
1105
1106 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1107 if (r < 0 && r != -ENOENT)
1108 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1109
1110 /* Skip this entry if it is not a remount. */
1111 if (mount_table[k].what && r > 0)
1112 continue;
1113
1114 r = mkdir_p(where, 0755);
1115 if (r < 0) {
1116 if (mount_table[k].fatal)
1117 return log_error_errno(r, "Failed to create directory %s: %m", where);
1118
1119 log_warning_errno(r, "Failed to create directory %s: %m", where);
1120 continue;
1121 }
1122
1123 o = mount_table[k].options;
1124 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1125 r = tmpfs_patch_options(o, &options);
1126 if (r < 0)
1127 return log_oom();
1128 if (r > 0)
1129 o = options;
1130 }
1131
1132 if (mount(mount_table[k].what,
1133 where,
1134 mount_table[k].type,
1135 mount_table[k].flags,
1136 o) < 0) {
1137
1138 if (mount_table[k].fatal)
1139 return log_error_errno(errno, "mount(%s) failed: %m", where);
1140
1141 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1142 }
1143 }
1144
1145 return 0;
1146 }
1147
1148 static int mount_bind(const char *dest, CustomMount *m) {
1149 struct stat source_st, dest_st;
1150 const char *where;
1151 int r;
1152
1153 assert(m);
1154
1155 if (stat(m->source, &source_st) < 0)
1156 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1157
1158 where = prefix_roota(dest, m->destination);
1159
1160 if (stat(where, &dest_st) >= 0) {
1161 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1162 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1163 return -EINVAL;
1164 }
1165
1166 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1167 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1168 return -EINVAL;
1169 }
1170
1171 } else if (errno == ENOENT) {
1172 r = mkdir_parents_label(where, 0755);
1173 if (r < 0)
1174 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1175 } else {
1176 log_error_errno(errno, "Failed to stat %s: %m", where);
1177 return -errno;
1178 }
1179
1180 /* Create the mount point. Any non-directory file can be
1181 * mounted on any non-directory file (regular, fifo, socket,
1182 * char, block).
1183 */
1184 if (S_ISDIR(source_st.st_mode))
1185 r = mkdir_label(where, 0755);
1186 else
1187 r = touch(where);
1188 if (r < 0 && r != -EEXIST)
1189 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1190
1191 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1192 return log_error_errno(errno, "mount(%s) failed: %m", where);
1193
1194 if (m->read_only) {
1195 r = bind_remount_recursive(where, true);
1196 if (r < 0)
1197 return log_error_errno(r, "Read-only bind mount failed: %m");
1198 }
1199
1200 return 0;
1201 }
1202
1203 static int mount_tmpfs(const char *dest, CustomMount *m) {
1204 const char *where, *options;
1205 _cleanup_free_ char *buf = NULL;
1206 int r;
1207
1208 assert(dest);
1209 assert(m);
1210
1211 where = prefix_roota(dest, m->destination);
1212
1213 r = mkdir_p_label(where, 0755);
1214 if (r < 0 && r != -EEXIST)
1215 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1216
1217 r = tmpfs_patch_options(m->options, &buf);
1218 if (r < 0)
1219 return log_oom();
1220 options = r > 0 ? buf : m->options;
1221
1222 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1223 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1224
1225 return 0;
1226 }
1227
1228 static int mount_overlay(const char *dest, CustomMount *m) {
1229 _cleanup_free_ char *lower = NULL;
1230 const char *where, *options;
1231 int r;
1232
1233 assert(dest);
1234 assert(m);
1235
1236 where = prefix_roota(dest, m->destination);
1237
1238 r = mkdir_label(where, 0755);
1239 if (r < 0 && r != -EEXIST)
1240 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1241
1242 (void) mkdir_p_label(m->source, 0755);
1243
1244 strv_reverse(m->lower);
1245 lower = strv_join(m->lower, ":");
1246 strv_reverse(m->lower);
1247 if (!lower)
1248 return log_oom();
1249
1250 if (m->read_only)
1251 options = strjoina("lowerdir=", m->source, ":", lower);
1252 else {
1253 assert(m->work_dir);
1254 (void) mkdir_label(m->work_dir, 0700);
1255
1256 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1257 }
1258
1259 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1260 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1261
1262 return 0;
1263 }
1264
1265 static int mount_custom(const char *dest) {
1266 unsigned i;
1267 int r;
1268
1269 assert(dest);
1270
1271 for (i = 0; i < arg_n_custom_mounts; i++) {
1272 CustomMount *m = &arg_custom_mounts[i];
1273
1274 switch (m->type) {
1275
1276 case CUSTOM_MOUNT_BIND:
1277 r = mount_bind(dest, m);
1278 break;
1279
1280 case CUSTOM_MOUNT_TMPFS:
1281 r = mount_tmpfs(dest, m);
1282 break;
1283
1284 case CUSTOM_MOUNT_OVERLAY:
1285 r = mount_overlay(dest, m);
1286 break;
1287
1288 default:
1289 assert_not_reached("Unknown custom mount type");
1290 }
1291
1292 if (r < 0)
1293 return r;
1294 }
1295
1296 return 0;
1297 }
1298
1299 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1300 char *to;
1301 int r;
1302
1303 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1304
1305 r = path_is_mount_point(to, 0);
1306 if (r < 0 && r != -ENOENT)
1307 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1308 if (r > 0)
1309 return 0;
1310
1311 mkdir_p(to, 0755);
1312
1313 /* The superblock mount options of the mount point need to be
1314 * identical to the hosts', and hence writable... */
1315 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1316 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1317
1318 /* ... hence let's only make the bind mount read-only, not the
1319 * superblock. */
1320 if (read_only) {
1321 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1322 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1323 }
1324 return 1;
1325 }
1326
1327 static int mount_cgroup(const char *dest) {
1328 _cleanup_set_free_free_ Set *controllers = NULL;
1329 const char *cgroup_root;
1330 int r;
1331
1332 controllers = set_new(&string_hash_ops);
1333 if (!controllers)
1334 return log_oom();
1335
1336 r = cg_kernel_controllers(controllers);
1337 if (r < 0)
1338 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1339
1340 for (;;) {
1341 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1342
1343 controller = set_steal_first(controllers);
1344 if (!controller)
1345 break;
1346
1347 origin = prefix_root("/sys/fs/cgroup/", controller);
1348 if (!origin)
1349 return log_oom();
1350
1351 r = readlink_malloc(origin, &combined);
1352 if (r == -EINVAL) {
1353 /* Not a symbolic link, but directly a single cgroup hierarchy */
1354
1355 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1356 if (r < 0)
1357 return r;
1358
1359 } else if (r < 0)
1360 return log_error_errno(r, "Failed to read link %s: %m", origin);
1361 else {
1362 _cleanup_free_ char *target = NULL;
1363
1364 target = prefix_root(dest, origin);
1365 if (!target)
1366 return log_oom();
1367
1368 /* A symbolic link, a combination of controllers in one hierarchy */
1369
1370 if (!filename_is_valid(combined)) {
1371 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1372 continue;
1373 }
1374
1375 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1376 if (r < 0)
1377 return r;
1378
1379 r = symlink_idempotent(combined, target);
1380 if (r == -EINVAL) {
1381 log_error("Invalid existing symlink for combined hierarchy");
1382 return r;
1383 }
1384 if (r < 0)
1385 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1386 }
1387 }
1388
1389 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1390 if (r < 0)
1391 return r;
1392
1393 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1394 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1395 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1396
1397 return 0;
1398 }
1399
1400 static int mount_systemd_cgroup_writable(const char *dest) {
1401 _cleanup_free_ char *own_cgroup_path = NULL;
1402 const char *systemd_root, *systemd_own;
1403 int r;
1404
1405 assert(dest);
1406
1407 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1408 if (r < 0)
1409 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1410
1411 /* Make our own cgroup a (writable) bind mount */
1412 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1413 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1414 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1415
1416 /* And then remount the systemd cgroup root read-only */
1417 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1418 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1419 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1420
1421 return 0;
1422 }
1423
1424 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1425 assert(p);
1426
1427 if (!arg_userns)
1428 return 0;
1429
1430 if (uid == UID_INVALID && gid == GID_INVALID)
1431 return 0;
1432
1433 if (uid != UID_INVALID) {
1434 uid += arg_uid_shift;
1435
1436 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1437 return -EOVERFLOW;
1438 }
1439
1440 if (gid != GID_INVALID) {
1441 gid += (gid_t) arg_uid_shift;
1442
1443 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1444 return -EOVERFLOW;
1445 }
1446
1447 if (lchown(p, uid, gid) < 0)
1448 return -errno;
1449
1450 return 0;
1451 }
1452
/* Creates the directory 'path' below 'root' with the given mode and,
 * if it was newly created, hands it to userns_lchown() with the given
 * uid/gid. An already existing directory is left untouched and counts
 * as success.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1465
1466 static int setup_timezone(const char *dest) {
1467 _cleanup_free_ char *p = NULL, *q = NULL;
1468 const char *where, *check, *what;
1469 char *z, *y;
1470 int r;
1471
1472 assert(dest);
1473
1474 /* Fix the timezone, if possible */
1475 r = readlink_malloc("/etc/localtime", &p);
1476 if (r < 0) {
1477 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1478 return 0;
1479 }
1480
1481 z = path_startswith(p, "../usr/share/zoneinfo/");
1482 if (!z)
1483 z = path_startswith(p, "/usr/share/zoneinfo/");
1484 if (!z) {
1485 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1486 return 0;
1487 }
1488
1489 where = prefix_roota(dest, "/etc/localtime");
1490 r = readlink_malloc(where, &q);
1491 if (r >= 0) {
1492 y = path_startswith(q, "../usr/share/zoneinfo/");
1493 if (!y)
1494 y = path_startswith(q, "/usr/share/zoneinfo/");
1495
1496 /* Already pointing to the right place? Then do nothing .. */
1497 if (y && streq(y, z))
1498 return 0;
1499 }
1500
1501 check = strjoina("/usr/share/zoneinfo/", z);
1502 check = prefix_root(dest, check);
1503 if (laccess(check, F_OK) < 0) {
1504 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1505 return 0;
1506 }
1507
1508 r = unlink(where);
1509 if (r < 0 && errno != ENOENT) {
1510 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1511 return 0;
1512 }
1513
1514 what = strjoina("../usr/share/zoneinfo/", z);
1515 if (symlink(what, where) < 0) {
1516 log_error_errno(errno, "Failed to correct timezone of container: %m");
1517 return 0;
1518 }
1519
1520 r = userns_lchown(where, 0, 0);
1521 if (r < 0)
1522 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1523
1524 return 0;
1525 }
1526
1527 static int setup_resolv_conf(const char *dest) {
1528 const char *where = NULL;
1529 int r;
1530
1531 assert(dest);
1532
1533 if (arg_private_network)
1534 return 0;
1535
1536 /* Fix resolv.conf, if possible */
1537 where = prefix_roota(dest, "/etc/resolv.conf");
1538
1539 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1540 if (r < 0) {
1541 /* If the file already exists as symlink, let's
1542 * suppress the warning, under the assumption that
1543 * resolved or something similar runs inside and the
1544 * symlink points there.
1545 *
1546 * If the disk image is read-only, there's also no
1547 * point in complaining.
1548 */
1549 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1550 "Failed to copy /etc/resolv.conf to %s: %m", where);
1551 return 0;
1552 }
1553
1554 r = userns_lchown(where, 0, 0);
1555 if (r < 0)
1556 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1557
1558 return 0;
1559 }
1560
1561 static int setup_volatile_state(const char *directory) {
1562 _cleanup_free_ char *buf = NULL;
1563 const char *p, *options;
1564 int r;
1565
1566 assert(directory);
1567
1568 if (arg_volatile != VOLATILE_STATE)
1569 return 0;
1570
1571 /* --volatile=state means we simply overmount /var
1572 with a tmpfs, and the rest read-only. */
1573
1574 r = bind_remount_recursive(directory, true);
1575 if (r < 0)
1576 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1577
1578 p = prefix_roota(directory, "/var");
1579 r = mkdir(p, 0755);
1580 if (r < 0 && errno != EEXIST)
1581 return log_error_errno(errno, "Failed to create %s: %m", directory);
1582
1583 options = "mode=755";
1584 r = tmpfs_patch_options(options, &buf);
1585 if (r < 0)
1586 return log_oom();
1587 if (r > 0)
1588 options = buf;
1589
1590 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1591 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1592
1593 return 0;
1594 }
1595
1596 static int setup_volatile(const char *directory) {
1597 bool tmpfs_mounted = false, bind_mounted = false;
1598 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1599 _cleanup_free_ char *buf = NULL;
1600 const char *f, *t, *options;
1601 int r;
1602
1603 assert(directory);
1604
1605 if (arg_volatile != VOLATILE_YES)
1606 return 0;
1607
1608 /* --volatile=yes means we mount a tmpfs to the root dir, and
1609 the original /usr to use inside it, and that read-only. */
1610
1611 if (!mkdtemp(template))
1612 return log_error_errno(errno, "Failed to create temporary directory: %m");
1613
1614 options = "mode=755";
1615 r = tmpfs_patch_options(options, &buf);
1616 if (r < 0)
1617 return log_oom();
1618 if (r > 0)
1619 options = buf;
1620
1621 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1622 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1623 goto fail;
1624 }
1625
1626 tmpfs_mounted = true;
1627
1628 f = prefix_roota(directory, "/usr");
1629 t = prefix_roota(template, "/usr");
1630
1631 r = mkdir(t, 0755);
1632 if (r < 0 && errno != EEXIST) {
1633 r = log_error_errno(errno, "Failed to create %s: %m", t);
1634 goto fail;
1635 }
1636
1637 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1638 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1639 goto fail;
1640 }
1641
1642 bind_mounted = true;
1643
1644 r = bind_remount_recursive(t, true);
1645 if (r < 0) {
1646 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1647 goto fail;
1648 }
1649
1650 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1651 r = log_error_errno(errno, "Failed to move root mount: %m");
1652 goto fail;
1653 }
1654
1655 (void) rmdir(template);
1656
1657 return 0;
1658
1659 fail:
1660 if (bind_mounted)
1661 (void) umount(t);
1662
1663 if (tmpfs_mounted)
1664 (void) umount(template);
1665 (void) rmdir(template);
1666 return r;
1667 }
1668
1669 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1670 assert(s);
1671
1672 snprintf(s, 37,
1673 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1674 SD_ID128_FORMAT_VAL(id));
1675
1676 return s;
1677 }
1678
1679 static int setup_boot_id(const char *dest) {
1680 const char *from, *to;
1681 sd_id128_t rnd = {};
1682 char as_uuid[37];
1683 int r;
1684
1685 if (arg_share_system)
1686 return 0;
1687
1688 /* Generate a new randomized boot ID, so that each boot-up of
1689 * the container gets a new one */
1690
1691 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1692 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1693
1694 r = sd_id128_randomize(&rnd);
1695 if (r < 0)
1696 return log_error_errno(r, "Failed to generate random boot id: %m");
1697
1698 id128_format_as_uuid(rnd, as_uuid);
1699
1700 r = write_string_file(from, as_uuid);
1701 if (r < 0)
1702 return log_error_errno(r, "Failed to write boot id: %m");
1703
1704 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1705 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1706 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1707 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1708
1709 unlink(from);
1710 return r;
1711 }
1712
1713 static int copy_devnodes(const char *dest) {
1714
1715 static const char devnodes[] =
1716 "null\0"
1717 "zero\0"
1718 "full\0"
1719 "random\0"
1720 "urandom\0"
1721 "tty\0"
1722 "net/tun\0";
1723
1724 const char *d;
1725 int r = 0;
1726 _cleanup_umask_ mode_t u;
1727
1728 assert(dest);
1729
1730 u = umask(0000);
1731
1732 /* Create /dev/net, so that we can create /dev/net/tun in it */
1733 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1734 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1735
1736 NULSTR_FOREACH(d, devnodes) {
1737 _cleanup_free_ char *from = NULL, *to = NULL;
1738 struct stat st;
1739
1740 from = strappend("/dev/", d);
1741 to = prefix_root(dest, from);
1742
1743 if (stat(from, &st) < 0) {
1744
1745 if (errno != ENOENT)
1746 return log_error_errno(errno, "Failed to stat %s: %m", from);
1747
1748 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1749
1750 log_error("%s is not a char or block device, cannot copy.", from);
1751 return -EIO;
1752
1753 } else {
1754 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1755 if (errno != EPERM)
1756 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1757
1758 /* Some systems abusively restrict mknod but
1759 * allow bind mounts. */
1760 r = touch(to);
1761 if (r < 0)
1762 return log_error_errno(r, "touch (%s) failed: %m", to);
1763 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1764 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1765 }
1766
1767 r = userns_lchown(to, 0, 0);
1768 if (r < 0)
1769 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1770 }
1771 }
1772
1773 return r;
1774 }
1775
1776 static int setup_pts(const char *dest) {
1777 _cleanup_free_ char *options = NULL;
1778 const char *p;
1779
1780 #ifdef HAVE_SELINUX
1781 if (arg_selinux_apifs_context)
1782 (void) asprintf(&options,
1783 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
1784 arg_uid_shift,
1785 arg_uid_shift + TTY_GID,
1786 arg_selinux_apifs_context);
1787 else
1788 #endif
1789 (void) asprintf(&options,
1790 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
1791 arg_uid_shift,
1792 arg_uid_shift + TTY_GID);
1793
1794 if (!options)
1795 return log_oom();
1796
1797 /* Mount /dev/pts itself */
1798 p = prefix_roota(dest, "/dev/pts");
1799 if (mkdir(p, 0755) < 0)
1800 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1801 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1802 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1803 if (userns_lchown(p, 0, 0) < 0)
1804 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1805
1806 /* Create /dev/ptmx symlink */
1807 p = prefix_roota(dest, "/dev/ptmx");
1808 if (symlink("pts/ptmx", p) < 0)
1809 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1810 if (userns_lchown(p, 0, 0) < 0)
1811 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1812
1813 /* And fix /dev/pts/ptmx ownership */
1814 p = prefix_roota(dest, "/dev/pts/ptmx");
1815 if (userns_lchown(p, 0, 0) < 0)
1816 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1817
1818 return 0;
1819 }
1820
1821 static int setup_dev_console(const char *dest, const char *console) {
1822 _cleanup_umask_ mode_t u;
1823 const char *to;
1824 int r;
1825
1826 assert(dest);
1827 assert(console);
1828
1829 u = umask(0000);
1830
1831 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1832 if (r < 0)
1833 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1834
1835 /* We need to bind mount the right tty to /dev/console since
1836 * ptys can only exist on pts file systems. To have something
1837 * to bind mount things on we create a empty regular file. */
1838
1839 to = prefix_roota(dest, "/dev/console");
1840 r = touch(to);
1841 if (r < 0)
1842 return log_error_errno(r, "touch() for /dev/console failed: %m");
1843
1844 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1845 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1846
1847 return 0;
1848 }
1849
1850 static int setup_kmsg(const char *dest, int kmsg_socket) {
1851 const char *from, *to;
1852 _cleanup_umask_ mode_t u;
1853 int fd, k;
1854 union {
1855 struct cmsghdr cmsghdr;
1856 uint8_t buf[CMSG_SPACE(sizeof(int))];
1857 } control = {};
1858 struct msghdr mh = {
1859 .msg_control = &control,
1860 .msg_controllen = sizeof(control),
1861 };
1862 struct cmsghdr *cmsg;
1863
1864 assert(kmsg_socket >= 0);
1865
1866 u = umask(0000);
1867
1868 /* We create the kmsg FIFO as /run/kmsg, but immediately
1869 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1870 * on the reading side behave very similar to /proc/kmsg,
1871 * their writing side behaves differently from /dev/kmsg in
1872 * that writing blocks when nothing is reading. In order to
1873 * avoid any problems with containers deadlocking due to this
1874 * we simply make /dev/kmsg unavailable to the container. */
1875 from = prefix_roota(dest, "/run/kmsg");
1876 to = prefix_roota(dest, "/proc/kmsg");
1877
1878 if (mkfifo(from, 0600) < 0)
1879 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1880 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1881 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1882
1883 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1884 if (fd < 0)
1885 return log_error_errno(errno, "Failed to open fifo: %m");
1886
1887 cmsg = CMSG_FIRSTHDR(&mh);
1888 cmsg->cmsg_level = SOL_SOCKET;
1889 cmsg->cmsg_type = SCM_RIGHTS;
1890 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1891 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1892
1893 mh.msg_controllen = cmsg->cmsg_len;
1894
1895 /* Store away the fd in the socket, so that it stays open as
1896 * long as we run the child */
1897 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1898 safe_close(fd);
1899
1900 if (k < 0)
1901 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1902
1903 /* And now make the FIFO unavailable as /run/kmsg... */
1904 (void) unlink(from);
1905
1906 return 0;
1907 }
1908
1909 static int send_rtnl(int send_fd) {
1910 union {
1911 struct cmsghdr cmsghdr;
1912 uint8_t buf[CMSG_SPACE(sizeof(int))];
1913 } control = {};
1914 struct msghdr mh = {
1915 .msg_control = &control,
1916 .msg_controllen = sizeof(control),
1917 };
1918 struct cmsghdr *cmsg;
1919 _cleanup_close_ int fd = -1;
1920 ssize_t k;
1921
1922 assert(send_fd >= 0);
1923
1924 if (!arg_expose_ports)
1925 return 0;
1926
1927 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1928 if (fd < 0)
1929 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1930
1931 cmsg = CMSG_FIRSTHDR(&mh);
1932 cmsg->cmsg_level = SOL_SOCKET;
1933 cmsg->cmsg_type = SCM_RIGHTS;
1934 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1935 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1936
1937 mh.msg_controllen = cmsg->cmsg_len;
1938
1939 /* Store away the fd in the socket, so that it stays open as
1940 * long as we run the child */
1941 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1942 if (k < 0)
1943 return log_error_errno(errno, "Failed to send netlink fd: %m");
1944
1945 return 0;
1946 }
1947
1948 static int flush_ports(union in_addr_union *exposed) {
1949 ExposePort *p;
1950 int r, af = AF_INET;
1951
1952 assert(exposed);
1953
1954 if (!arg_expose_ports)
1955 return 0;
1956
1957 if (in_addr_is_null(af, exposed))
1958 return 0;
1959
1960 log_debug("Lost IP address.");
1961
1962 LIST_FOREACH(ports, p, arg_expose_ports) {
1963 r = fw_add_local_dnat(false,
1964 af,
1965 p->protocol,
1966 NULL,
1967 NULL, 0,
1968 NULL, 0,
1969 p->host_port,
1970 exposed,
1971 p->container_port,
1972 NULL);
1973 if (r < 0)
1974 log_warning_errno(r, "Failed to modify firewall: %m");
1975 }
1976
1977 *exposed = IN_ADDR_NULL;
1978 return 0;
1979 }
1980
1981 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
1982 _cleanup_free_ struct local_address *addresses = NULL;
1983 _cleanup_free_ char *pretty = NULL;
1984 union in_addr_union new_exposed;
1985 ExposePort *p;
1986 bool add;
1987 int af = AF_INET, r;
1988
1989 assert(exposed);
1990
1991 /* Invoked each time an address is added or removed inside the
1992 * container */
1993
1994 if (!arg_expose_ports)
1995 return 0;
1996
1997 r = local_addresses(rtnl, 0, af, &addresses);
1998 if (r < 0)
1999 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2000
2001 add = r > 0 &&
2002 addresses[0].family == af &&
2003 addresses[0].scope < RT_SCOPE_LINK;
2004
2005 if (!add)
2006 return flush_ports(exposed);
2007
2008 new_exposed = addresses[0].address;
2009 if (in_addr_equal(af, exposed, &new_exposed))
2010 return 0;
2011
2012 in_addr_to_string(af, &new_exposed, &pretty);
2013 log_debug("New container IP is %s.", strna(pretty));
2014
2015 LIST_FOREACH(ports, p, arg_expose_ports) {
2016
2017 r = fw_add_local_dnat(true,
2018 af,
2019 p->protocol,
2020 NULL,
2021 NULL, 0,
2022 NULL, 0,
2023 p->host_port,
2024 &new_exposed,
2025 p->container_port,
2026 in_addr_is_null(af, exposed) ? NULL : exposed);
2027 if (r < 0)
2028 log_warning_errno(r, "Failed to modify firewall: %m");
2029 }
2030
2031 *exposed = new_exposed;
2032 return 0;
2033 }
2034
2035 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2036 union in_addr_union *exposed = userdata;
2037
2038 assert(rtnl);
2039 assert(m);
2040 assert(exposed);
2041
2042 expose_ports(rtnl, exposed);
2043 return 0;
2044 }
2045
2046 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2047 union {
2048 struct cmsghdr cmsghdr;
2049 uint8_t buf[CMSG_SPACE(sizeof(int))];
2050 } control = {};
2051 struct msghdr mh = {
2052 .msg_control = &control,
2053 .msg_controllen = sizeof(control),
2054 };
2055 struct cmsghdr *cmsg;
2056 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2057 int fd, r;
2058 ssize_t k;
2059
2060 assert(event);
2061 assert(recv_fd >= 0);
2062 assert(ret);
2063
2064 if (!arg_expose_ports)
2065 return 0;
2066
2067 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2068 if (k < 0)
2069 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2070
2071 cmsg = CMSG_FIRSTHDR(&mh);
2072 assert(cmsg->cmsg_level == SOL_SOCKET);
2073 assert(cmsg->cmsg_type == SCM_RIGHTS);
2074 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2075 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2076
2077 r = sd_netlink_open_fd(&rtnl, fd);
2078 if (r < 0) {
2079 safe_close(fd);
2080 return log_error_errno(r, "Failed to create rtnl object: %m");
2081 }
2082
2083 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2084 if (r < 0)
2085 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2086
2087 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2088 if (r < 0)
2089 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2090
2091 r = sd_netlink_attach_event(rtnl, event, 0);
2092 if (r < 0)
2093 return log_error_errno(r, "Failed to add to even loop: %m");
2094
2095 *ret = rtnl;
2096 rtnl = NULL;
2097
2098 return 0;
2099 }
2100
2101 static int setup_hostname(void) {
2102
2103 if (arg_share_system)
2104 return 0;
2105
2106 if (sethostname_idempotent(arg_machine) < 0)
2107 return -errno;
2108
2109 return 0;
2110 }
2111
2112 static int setup_journal(const char *directory) {
2113 sd_id128_t machine_id, this_id;
2114 _cleanup_free_ char *b = NULL, *d = NULL;
2115 const char *etc_machine_id, *p, *q;
2116 char *id;
2117 int r;
2118
2119 /* Don't link journals in ephemeral mode */
2120 if (arg_ephemeral)
2121 return 0;
2122
2123 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2124
2125 r = read_one_line_file(etc_machine_id, &b);
2126 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2127 return 0;
2128 else if (r < 0)
2129 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2130
2131 id = strstrip(b);
2132 if (isempty(id) && arg_link_journal == LINK_AUTO)
2133 return 0;
2134
2135 /* Verify validity */
2136 r = sd_id128_from_string(id, &machine_id);
2137 if (r < 0)
2138 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2139
2140 r = sd_id128_get_machine(&this_id);
2141 if (r < 0)
2142 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2143
2144 if (sd_id128_equal(machine_id, this_id)) {
2145 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2146 "Host and machine ids are equal (%s): refusing to link journals", id);
2147 if (arg_link_journal == LINK_AUTO)
2148 return 0;
2149 return -EEXIST;
2150 }
2151
2152 if (arg_link_journal == LINK_NO)
2153 return 0;
2154
2155 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2156 if (r < 0)
2157 return log_error_errno(r, "Failed to create /var: %m");
2158
2159 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2160 if (r < 0)
2161 return log_error_errno(r, "Failed to create /var/log: %m");
2162
2163 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2164 if (r < 0)
2165 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2166
2167 p = strjoina("/var/log/journal/", id);
2168 q = prefix_roota(directory, p);
2169
2170 if (path_is_mount_point(p, 0) > 0) {
2171 if (arg_link_journal != LINK_AUTO) {
2172 log_error("%s: already a mount point, refusing to use for journal", p);
2173 return -EEXIST;
2174 }
2175
2176 return 0;
2177 }
2178
2179 if (path_is_mount_point(q, 0) > 0) {
2180 if (arg_link_journal != LINK_AUTO) {
2181 log_error("%s: already a mount point, refusing to use for journal", q);
2182 return -EEXIST;
2183 }
2184
2185 return 0;
2186 }
2187
2188 r = readlink_and_make_absolute(p, &d);
2189 if (r >= 0) {
2190 if ((arg_link_journal == LINK_GUEST ||
2191 arg_link_journal == LINK_AUTO) &&
2192 path_equal(d, q)) {
2193
2194 r = userns_mkdir(directory, p, 0755, 0, 0);
2195 if (r < 0)
2196 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2197 return 0;
2198 }
2199
2200 if (unlink(p) < 0)
2201 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2202 } else if (r == -EINVAL) {
2203
2204 if (arg_link_journal == LINK_GUEST &&
2205 rmdir(p) < 0) {
2206
2207 if (errno == ENOTDIR) {
2208 log_error("%s already exists and is neither a symlink nor a directory", p);
2209 return r;
2210 } else {
2211 log_error_errno(errno, "Failed to remove %s: %m", p);
2212 return -errno;
2213 }
2214 }
2215 } else if (r != -ENOENT) {
2216 log_error_errno(errno, "readlink(%s) failed: %m", p);
2217 return r;
2218 }
2219
2220 if (arg_link_journal == LINK_GUEST) {
2221
2222 if (symlink(q, p) < 0) {
2223 if (arg_link_journal_try) {
2224 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2225 return 0;
2226 } else {
2227 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2228 return -errno;
2229 }
2230 }
2231
2232 r = userns_mkdir(directory, p, 0755, 0, 0);
2233 if (r < 0)
2234 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2235 return 0;
2236 }
2237
2238 if (arg_link_journal == LINK_HOST) {
2239 /* don't create parents here -- if the host doesn't have
2240 * permanent journal set up, don't force it here */
2241 r = mkdir(p, 0755);
2242 if (r < 0) {
2243 if (arg_link_journal_try) {
2244 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2245 return 0;
2246 } else {
2247 log_error_errno(errno, "Failed to create %s: %m", p);
2248 return r;
2249 }
2250 }
2251
2252 } else if (access(p, F_OK) < 0)
2253 return 0;
2254
2255 if (dir_is_empty(q) == 0)
2256 log_warning("%s is not empty, proceeding anyway.", q);
2257
2258 r = userns_mkdir(directory, p, 0755, 0, 0);
2259 if (r < 0) {
2260 log_error_errno(errno, "Failed to create %s: %m", q);
2261 return r;
2262 }
2263
2264 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2265 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2266
2267 return 0;
2268 }
2269
2270 static int drop_capabilities(void) {
2271 return capability_bounding_set_drop(~arg_retain, false);
2272 }
2273
2274 static int register_machine(pid_t pid, int local_ifindex) {
2275 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2276 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2277 int r;
2278
2279 if (!arg_register)
2280 return 0;
2281
2282 r = sd_bus_default_system(&bus);
2283 if (r < 0)
2284 return log_error_errno(r, "Failed to open system bus: %m");
2285
2286 if (arg_keep_unit) {
2287 r = sd_bus_call_method(
2288 bus,
2289 "org.freedesktop.machine1",
2290 "/org/freedesktop/machine1",
2291 "org.freedesktop.machine1.Manager",
2292 "RegisterMachineWithNetwork",
2293 &error,
2294 NULL,
2295 "sayssusai",
2296 arg_machine,
2297 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2298 "nspawn",
2299 "container",
2300 (uint32_t) pid,
2301 strempty(arg_directory),
2302 local_ifindex > 0 ? 1 : 0, local_ifindex);
2303 } else {
2304 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2305 char **i;
2306 unsigned j;
2307
2308 r = sd_bus_message_new_method_call(
2309 bus,
2310 &m,
2311 "org.freedesktop.machine1",
2312 "/org/freedesktop/machine1",
2313 "org.freedesktop.machine1.Manager",
2314 "CreateMachineWithNetwork");
2315 if (r < 0)
2316 return bus_log_create_error(r);
2317
2318 r = sd_bus_message_append(
2319 m,
2320 "sayssusai",
2321 arg_machine,
2322 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2323 "nspawn",
2324 "container",
2325 (uint32_t) pid,
2326 strempty(arg_directory),
2327 local_ifindex > 0 ? 1 : 0, local_ifindex);
2328 if (r < 0)
2329 return bus_log_create_error(r);
2330
2331 r = sd_bus_message_open_container(m, 'a', "(sv)");
2332 if (r < 0)
2333 return bus_log_create_error(r);
2334
2335 if (!isempty(arg_slice)) {
2336 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2337 if (r < 0)
2338 return bus_log_create_error(r);
2339 }
2340
2341 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2342 if (r < 0)
2343 return bus_log_create_error(r);
2344
2345 /* If you make changes here, also make sure to update
2346 * systemd-nspawn@.service, to keep the device
2347 * policies in sync regardless if we are run with or
2348 * without the --keep-unit switch. */
2349 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2350 /* Allow the container to
2351 * access and create the API
2352 * device nodes, so that
2353 * PrivateDevices= in the
2354 * container can work
2355 * fine */
2356 "/dev/null", "rwm",
2357 "/dev/zero", "rwm",
2358 "/dev/full", "rwm",
2359 "/dev/random", "rwm",
2360 "/dev/urandom", "rwm",
2361 "/dev/tty", "rwm",
2362 "/dev/net/tun", "rwm",
2363 /* Allow the container
2364 * access to ptys. However,
2365 * do not permit the
2366 * container to ever create
2367 * these device nodes. */
2368 "/dev/pts/ptmx", "rw",
2369 "char-pts", "rw");
2370 if (r < 0)
2371 return bus_log_create_error(r);
2372
2373 for (j = 0; j < arg_n_custom_mounts; j++) {
2374 CustomMount *cm = &arg_custom_mounts[j];
2375
2376 if (cm->type != CUSTOM_MOUNT_BIND)
2377 continue;
2378
2379 r = is_device_node(cm->source);
2380 if (r < 0)
2381 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2382
2383 if (r) {
2384 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2385 cm->source, cm->read_only ? "r" : "rw");
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to append message arguments: %m");
2388 }
2389 }
2390
2391 if (arg_kill_signal != 0) {
2392 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2393 if (r < 0)
2394 return bus_log_create_error(r);
2395
2396 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2397 if (r < 0)
2398 return bus_log_create_error(r);
2399 }
2400
2401 STRV_FOREACH(i, arg_property) {
2402 r = sd_bus_message_open_container(m, 'r', "sv");
2403 if (r < 0)
2404 return bus_log_create_error(r);
2405
2406 r = bus_append_unit_property_assignment(m, *i);
2407 if (r < 0)
2408 return r;
2409
2410 r = sd_bus_message_close_container(m);
2411 if (r < 0)
2412 return bus_log_create_error(r);
2413 }
2414
2415 r = sd_bus_message_close_container(m);
2416 if (r < 0)
2417 return bus_log_create_error(r);
2418
2419 r = sd_bus_call(bus, m, 0, &error, NULL);
2420 }
2421
2422 if (r < 0) {
2423 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2424 return r;
2425 }
2426
2427 return 0;
2428 }
2429
2430 static int terminate_machine(pid_t pid) {
2431 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2432 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2433 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2434 const char *path;
2435 int r;
2436
2437 if (!arg_register)
2438 return 0;
2439
2440 /* If we are reusing the unit, then just exit, systemd will do
2441 * the right thing when we exit. */
2442 if (arg_keep_unit)
2443 return 0;
2444
2445 r = sd_bus_default_system(&bus);
2446 if (r < 0)
2447 return log_error_errno(r, "Failed to open system bus: %m");
2448
2449 r = sd_bus_call_method(
2450 bus,
2451 "org.freedesktop.machine1",
2452 "/org/freedesktop/machine1",
2453 "org.freedesktop.machine1.Manager",
2454 "GetMachineByPID",
2455 &error,
2456 &reply,
2457 "u",
2458 (uint32_t) pid);
2459 if (r < 0) {
2460 /* Note that the machine might already have been
2461 * cleaned up automatically, hence don't consider it a
2462 * failure if we cannot get the machine object. */
2463 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2464 return 0;
2465 }
2466
2467 r = sd_bus_message_read(reply, "o", &path);
2468 if (r < 0)
2469 return bus_log_parse_error(r);
2470
2471 r = sd_bus_call_method(
2472 bus,
2473 "org.freedesktop.machine1",
2474 path,
2475 "org.freedesktop.machine1.Machine",
2476 "Terminate",
2477 &error,
2478 NULL,
2479 NULL);
2480 if (r < 0) {
2481 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2482 return 0;
2483 }
2484
2485 return 0;
2486 }
2487
2488 static int reset_audit_loginuid(void) {
2489 _cleanup_free_ char *p = NULL;
2490 int r;
2491
2492 if (arg_share_system)
2493 return 0;
2494
2495 r = read_one_line_file("/proc/self/loginuid", &p);
2496 if (r == -ENOENT)
2497 return 0;
2498 if (r < 0)
2499 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2500
2501 /* Already reset? */
2502 if (streq(p, "4294967295"))
2503 return 0;
2504
2505 r = write_string_file("/proc/self/loginuid", "4294967295");
2506 if (r < 0) {
2507 log_error_errno(r,
2508 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2509 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2510 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2511 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2512 "using systemd-nspawn. Sleeping for 5s... (%m)");
2513
2514 sleep(5);
2515 }
2516
2517 return 0;
2518 }
2519
/* Fixed 128-bit keys passed to generate_mac() as its hash_key argument.
 * setup_veth() uses HOST_HASH_KEY and CONTAINER_HASH_KEY for the two ends of
 * the veth pair, setup_macvlan() uses MACVLAN_HASH_KEY. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2523
2524 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2525 uint8_t result[8];
2526 size_t l, sz;
2527 uint8_t *v, *i;
2528 int r;
2529
2530 l = strlen(arg_machine);
2531 sz = sizeof(sd_id128_t) + l;
2532 if (idx > 0)
2533 sz += sizeof(idx);
2534
2535 v = alloca(sz);
2536
2537 /* fetch some persistent data unique to the host */
2538 r = sd_id128_get_machine((sd_id128_t*) v);
2539 if (r < 0)
2540 return r;
2541
2542 /* combine with some data unique (on this host) to this
2543 * container instance */
2544 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2545 if (idx > 0) {
2546 idx = htole64(idx);
2547 memcpy(i, &idx, sizeof(idx));
2548 }
2549
2550 /* Let's hash the host machine ID plus the container name. We
2551 * use a fixed, but originally randomly created hash key here. */
2552 siphash24(result, v, sz, hash_key.bytes);
2553
2554 assert_cc(ETH_ALEN <= sizeof(result));
2555 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2556
2557 /* see eth_random_addr in the kernel */
2558 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2559 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2560
2561 return 0;
2562 }
2563
2564 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2565 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2566 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2567 struct ether_addr mac_host, mac_container;
2568 int r, i;
2569
2570 if (!arg_private_network)
2571 return 0;
2572
2573 if (!arg_network_veth)
2574 return 0;
2575
2576 /* Use two different interface name prefixes depending whether
2577 * we are in bridge mode or not. */
2578 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2579 arg_network_bridge ? "vb" : "ve", arg_machine);
2580
2581 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2582 if (r < 0)
2583 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2584
2585 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2586 if (r < 0)
2587 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2588
2589 r = sd_netlink_open(&rtnl);
2590 if (r < 0)
2591 return log_error_errno(r, "Failed to connect to netlink: %m");
2592
2593 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2594 if (r < 0)
2595 return log_error_errno(r, "Failed to allocate netlink message: %m");
2596
2597 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2598 if (r < 0)
2599 return log_error_errno(r, "Failed to add netlink interface name: %m");
2600
2601 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2602 if (r < 0)
2603 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2604
2605 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2606 if (r < 0)
2607 return log_error_errno(r, "Failed to open netlink container: %m");
2608
2609 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2610 if (r < 0)
2611 return log_error_errno(r, "Failed to open netlink container: %m");
2612
2613 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2614 if (r < 0)
2615 return log_error_errno(r, "Failed to open netlink container: %m");
2616
2617 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2618 if (r < 0)
2619 return log_error_errno(r, "Failed to add netlink interface name: %m");
2620
2621 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2622 if (r < 0)
2623 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2624
2625 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2626 if (r < 0)
2627 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2628
2629 r = sd_netlink_message_close_container(m);
2630 if (r < 0)
2631 return log_error_errno(r, "Failed to close netlink container: %m");
2632
2633 r = sd_netlink_message_close_container(m);
2634 if (r < 0)
2635 return log_error_errno(r, "Failed to close netlink container: %m");
2636
2637 r = sd_netlink_message_close_container(m);
2638 if (r < 0)
2639 return log_error_errno(r, "Failed to close netlink container: %m");
2640
2641 r = sd_netlink_call(rtnl, m, 0, NULL);
2642 if (r < 0)
2643 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2644
2645 i = (int) if_nametoindex(iface_name);
2646 if (i <= 0)
2647 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2648
2649 *ifi = i;
2650
2651 return 0;
2652 }
2653
2654 static int setup_bridge(const char veth_name[], int *ifi) {
2655 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2656 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2657 int r, bridge;
2658
2659 if (!arg_private_network)
2660 return 0;
2661
2662 if (!arg_network_veth)
2663 return 0;
2664
2665 if (!arg_network_bridge)
2666 return 0;
2667
2668 bridge = (int) if_nametoindex(arg_network_bridge);
2669 if (bridge <= 0)
2670 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2671
2672 *ifi = bridge;
2673
2674 r = sd_netlink_open(&rtnl);
2675 if (r < 0)
2676 return log_error_errno(r, "Failed to connect to netlink: %m");
2677
2678 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2679 if (r < 0)
2680 return log_error_errno(r, "Failed to allocate netlink message: %m");
2681
2682 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2683 if (r < 0)
2684 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2685
2686 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2687 if (r < 0)
2688 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2689
2690 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2691 if (r < 0)
2692 return log_error_errno(r, "Failed to add netlink master field: %m");
2693
2694 r = sd_netlink_call(rtnl, m, 0, NULL);
2695 if (r < 0)
2696 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2697
2698 return 0;
2699 }
2700
2701 static int parse_interface(struct udev *udev, const char *name) {
2702 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2703 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2704 int ifi;
2705
2706 ifi = (int) if_nametoindex(name);
2707 if (ifi <= 0)
2708 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2709
2710 sprintf(ifi_str, "n%i", ifi);
2711 d = udev_device_new_from_device_id(udev, ifi_str);
2712 if (!d)
2713 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2714
2715 if (udev_device_get_is_initialized(d) <= 0) {
2716 log_error("Network interface %s is not initialized yet.", name);
2717 return -EBUSY;
2718 }
2719
2720 return ifi;
2721 }
2722
2723 static int move_network_interfaces(pid_t pid) {
2724 _cleanup_udev_unref_ struct udev *udev = NULL;
2725 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2726 char **i;
2727 int r;
2728
2729 if (!arg_private_network)
2730 return 0;
2731
2732 if (strv_isempty(arg_network_interfaces))
2733 return 0;
2734
2735 r = sd_netlink_open(&rtnl);
2736 if (r < 0)
2737 return log_error_errno(r, "Failed to connect to netlink: %m");
2738
2739 udev = udev_new();
2740 if (!udev) {
2741 log_error("Failed to connect to udev.");
2742 return -ENOMEM;
2743 }
2744
2745 STRV_FOREACH(i, arg_network_interfaces) {
2746 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2747 int ifi;
2748
2749 ifi = parse_interface(udev, *i);
2750 if (ifi < 0)
2751 return ifi;
2752
2753 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2754 if (r < 0)
2755 return log_error_errno(r, "Failed to allocate netlink message: %m");
2756
2757 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2758 if (r < 0)
2759 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2760
2761 r = sd_netlink_call(rtnl, m, 0, NULL);
2762 if (r < 0)
2763 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2764 }
2765
2766 return 0;
2767 }
2768
2769 static int setup_macvlan(pid_t pid) {
2770 _cleanup_udev_unref_ struct udev *udev = NULL;
2771 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2772 unsigned idx = 0;
2773 char **i;
2774 int r;
2775
2776 if (!arg_private_network)
2777 return 0;
2778
2779 if (strv_isempty(arg_network_macvlan))
2780 return 0;
2781
2782 r = sd_netlink_open(&rtnl);
2783 if (r < 0)
2784 return log_error_errno(r, "Failed to connect to netlink: %m");
2785
2786 udev = udev_new();
2787 if (!udev) {
2788 log_error("Failed to connect to udev.");
2789 return -ENOMEM;
2790 }
2791
2792 STRV_FOREACH(i, arg_network_macvlan) {
2793 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2794 _cleanup_free_ char *n = NULL;
2795 struct ether_addr mac;
2796 int ifi;
2797
2798 ifi = parse_interface(udev, *i);
2799 if (ifi < 0)
2800 return ifi;
2801
2802 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2803 if (r < 0)
2804 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2805
2806 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2807 if (r < 0)
2808 return log_error_errno(r, "Failed to allocate netlink message: %m");
2809
2810 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2811 if (r < 0)
2812 return log_error_errno(r, "Failed to add netlink interface index: %m");
2813
2814 n = strappend("mv-", *i);
2815 if (!n)
2816 return log_oom();
2817
2818 strshorten(n, IFNAMSIZ-1);
2819
2820 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2821 if (r < 0)
2822 return log_error_errno(r, "Failed to add netlink interface name: %m");
2823
2824 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2825 if (r < 0)
2826 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2827
2828 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2829 if (r < 0)
2830 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2831
2832 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2833 if (r < 0)
2834 return log_error_errno(r, "Failed to open netlink container: %m");
2835
2836 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2837 if (r < 0)
2838 return log_error_errno(r, "Failed to open netlink container: %m");
2839
2840 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2841 if (r < 0)
2842 return log_error_errno(r, "Failed to append macvlan mode: %m");
2843
2844 r = sd_netlink_message_close_container(m);
2845 if (r < 0)
2846 return log_error_errno(r, "Failed to close netlink container: %m");
2847
2848 r = sd_netlink_message_close_container(m);
2849 if (r < 0)
2850 return log_error_errno(r, "Failed to close netlink container: %m");
2851
2852 r = sd_netlink_call(rtnl, m, 0, NULL);
2853 if (r < 0)
2854 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2855 }
2856
2857 return 0;
2858 }
2859
2860 static int setup_ipvlan(pid_t pid) {
2861 _cleanup_udev_unref_ struct udev *udev = NULL;
2862 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2863 char **i;
2864 int r;
2865
2866 if (!arg_private_network)
2867 return 0;
2868
2869 if (strv_isempty(arg_network_ipvlan))
2870 return 0;
2871
2872 r = sd_netlink_open(&rtnl);
2873 if (r < 0)
2874 return log_error_errno(r, "Failed to connect to netlink: %m");
2875
2876 udev = udev_new();
2877 if (!udev) {
2878 log_error("Failed to connect to udev.");
2879 return -ENOMEM;
2880 }
2881
2882 STRV_FOREACH(i, arg_network_ipvlan) {
2883 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2884 _cleanup_free_ char *n = NULL;
2885 int ifi;
2886
2887 ifi = parse_interface(udev, *i);
2888 if (ifi < 0)
2889 return ifi;
2890
2891 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2892 if (r < 0)
2893 return log_error_errno(r, "Failed to allocate netlink message: %m");
2894
2895 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2896 if (r < 0)
2897 return log_error_errno(r, "Failed to add netlink interface index: %m");
2898
2899 n = strappend("iv-", *i);
2900 if (!n)
2901 return log_oom();
2902
2903 strshorten(n, IFNAMSIZ-1);
2904
2905 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2906 if (r < 0)
2907 return log_error_errno(r, "Failed to add netlink interface name: %m");
2908
2909 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2910 if (r < 0)
2911 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2912
2913 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2914 if (r < 0)
2915 return log_error_errno(r, "Failed to open netlink container: %m");
2916
2917 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2918 if (r < 0)
2919 return log_error_errno(r, "Failed to open netlink container: %m");
2920
2921 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2922 if (r < 0)
2923 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2924
2925 r = sd_netlink_message_close_container(m);
2926 if (r < 0)
2927 return log_error_errno(r, "Failed to close netlink container: %m");
2928
2929 r = sd_netlink_message_close_container(m);
2930 if (r < 0)
2931 return log_error_errno(r, "Failed to close netlink container: %m");
2932
2933 r = sd_netlink_call(rtnl, m, 0, NULL);
2934 if (r < 0)
2935 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2936 }
2937
2938 return 0;
2939 }
2940
/* Builds and loads a seccomp filter that allows everything by default and
 *
 *   - makes each system call in the table below fail with EPERM, unless the
 *     capability listed next to it is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, and also when seccomp_load() fails with -EINVAL;
 * any other failure is returned as a negative error code. Without
 * HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned n;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                /* A retained capability keeps its system calls available */
                if (arg_retain & (1ULL << blacklist[n].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[n].syscall_num, 0);

                /* -EFAULT means unknown syscall, which is not treated as an error */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }


        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
3035
3036 static int setup_propagate(const char *root) {
3037 const char *p, *q;
3038
3039 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3040 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3041 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3042 (void) mkdir_p(p, 0600);
3043
3044 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3045 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3046
3047 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3048 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3049
3050 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3051 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3052
3053 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3054 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3055 return log_error_errno(errno, "Failed to install propagation bind mount.");
3056
3057 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3058 return log_error_errno(errno, "Failed to make propagation mount read-only");
3059
3060 return 0;
3061 }
3062
3063 static int setup_image(char **device_path, int *loop_nr) {
3064 struct loop_info64 info = {
3065 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3066 };
3067 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3068 _cleanup_free_ char* loopdev = NULL;
3069 struct stat st;
3070 int r, nr;
3071
3072 assert(device_path);
3073 assert(loop_nr);
3074 assert(arg_image);
3075
3076 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3077 if (fd < 0)
3078 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3079
3080 if (fstat(fd, &st) < 0)
3081 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3082
3083 if (S_ISBLK(st.st_mode)) {
3084 char *p;
3085
3086 p = strdup(arg_image);
3087 if (!p)
3088 return log_oom();
3089
3090 *device_path = p;
3091
3092 *loop_nr = -1;
3093
3094 r = fd;
3095 fd = -1;
3096
3097 return r;
3098 }
3099
3100 if (!S_ISREG(st.st_mode)) {
3101 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3102 return -EINVAL;
3103 }
3104
3105 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3106 if (control < 0)
3107 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3108
3109 nr = ioctl(control, LOOP_CTL_GET_FREE);
3110 if (nr < 0)
3111 return log_error_errno(errno, "Failed to allocate loop device: %m");
3112
3113 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3114 return log_oom();
3115
3116 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3117 if (loop < 0)
3118 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3119
3120 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3121 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3122
3123 if (arg_read_only)
3124 info.lo_flags |= LO_FLAGS_READ_ONLY;
3125
3126 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3127 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3128
3129 *device_path = loopdev;
3130 loopdev = NULL;
3131
3132 *loop_nr = nr;
3133
3134 r = loop;
3135 loop = -1;
3136
3137 return r;
3138 }
3139
/* Explanatory text that dissect_image() appends to its error messages when
 * no partition table, or none of type GPT or MBR, is found on the image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3146
3147 static int dissect_image(
3148 int fd,
3149 char **root_device, bool *root_device_rw,
3150 char **home_device, bool *home_device_rw,
3151 char **srv_device, bool *srv_device_rw,
3152 bool *secondary) {
3153
3154 #ifdef HAVE_BLKID
3155 int home_nr = -1, srv_nr = -1;
3156 #ifdef GPT_ROOT_NATIVE
3157 int root_nr = -1;
3158 #endif
3159 #ifdef GPT_ROOT_SECONDARY
3160 int secondary_root_nr = -1;
3161 #endif
3162 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3163 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3164 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3165 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3166 _cleanup_udev_unref_ struct udev *udev = NULL;
3167 struct udev_list_entry *first, *item;
3168 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3169 bool is_gpt, is_mbr, multiple_generic = false;
3170 const char *pttype = NULL;
3171 blkid_partlist pl;
3172 struct stat st;
3173 unsigned i;
3174 int r;
3175
3176 assert(fd >= 0);
3177 assert(root_device);
3178 assert(home_device);
3179 assert(srv_device);
3180 assert(secondary);
3181 assert(arg_image);
3182
3183 b = blkid_new_probe();
3184 if (!b)
3185 return log_oom();
3186
3187 errno = 0;
3188 r = blkid_probe_set_device(b, fd, 0, 0);
3189 if (r != 0) {
3190 if (errno == 0)
3191 return log_oom();
3192
3193 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3194 return -errno;
3195 }
3196
3197 blkid_probe_enable_partitions(b, 1);
3198 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3199
3200 errno = 0;
3201 r = blkid_do_safeprobe(b);
3202 if (r == -2 || r == 1) {
3203 log_error("Failed to identify any partition table on\n"
3204 " %s\n"
3205 PARTITION_TABLE_BLURB, arg_image);
3206 return -EINVAL;
3207 } else if (r != 0) {
3208 if (errno == 0)
3209 errno = EIO;
3210 log_error_errno(errno, "Failed to probe: %m");
3211 return -errno;
3212 }
3213
3214 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3215
3216 is_gpt = streq_ptr(pttype, "gpt");
3217 is_mbr = streq_ptr(pttype, "dos");
3218
3219 if (!is_gpt && !is_mbr) {
3220 log_error("No GPT or MBR partition table discovered on\n"
3221 " %s\n"
3222 PARTITION_TABLE_BLURB, arg_image);
3223 return -EINVAL;
3224 }
3225
3226 errno = 0;
3227 pl = blkid_probe_get_partitions(b);
3228 if (!pl) {
3229 if (errno == 0)
3230 return log_oom();
3231
3232 log_error("Failed to list partitions of %s", arg_image);
3233 return -errno;
3234 }
3235
3236 udev = udev_new();
3237 if (!udev)
3238 return log_oom();
3239
3240 if (fstat(fd, &st) < 0)
3241 return log_error_errno(errno, "Failed to stat block device: %m");
3242
3243 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3244 if (!d)
3245 return log_oom();
3246
3247 for (i = 0;; i++) {
3248 int n, m;
3249
3250 if (i >= 10) {
3251 log_error("Kernel partitions never appeared.");
3252 return -ENXIO;
3253 }
3254
3255 e = udev_enumerate_new(udev);
3256 if (!e)
3257 return log_oom();
3258
3259 r = udev_enumerate_add_match_parent(e, d);
3260 if (r < 0)
3261 return log_oom();
3262
3263 r = udev_enumerate_scan_devices(e);
3264 if (r < 0)
3265 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3266
3267 /* Count the partitions enumerated by the kernel */
3268 n = 0;
3269 first = udev_enumerate_get_list_entry(e);
3270 udev_list_entry_foreach(item, first)
3271 n++;
3272
3273 /* Count the partitions enumerated by blkid */
3274 m = blkid_partlist_numof_partitions(pl);
3275 if (n == m + 1)
3276 break;
3277 if (n > m + 1) {
3278 log_error("blkid and kernel partition list do not match.");
3279 return -EIO;
3280 }
3281 if (n < m + 1) {
3282 unsigned j;
3283
3284 /* The kernel has probed fewer partitions than
3285 * blkid? Maybe the kernel prober is still
3286 * running or it got EBUSY because udev
3287 * already opened the device. Let's reprobe
3288 * the device, which is a synchronous call
3289 * that waits until probing is complete. */
3290
3291 for (j = 0; j < 20; j++) {
3292
3293 r = ioctl(fd, BLKRRPART, 0);
3294 if (r < 0)
3295 r = -errno;
3296 if (r >= 0 || r != -EBUSY)
3297 break;
3298
3299 /* If something else has the device
3300 * open, such as an udev rule, the
3301 * ioctl will return EBUSY. Since
3302 * there's no way to wait until it
3303 * isn't busy anymore, let's just wait
3304 * a bit, and try again.
3305 *
3306 * This is really something they
3307 * should fix in the kernel! */
3308
3309 usleep(50 * USEC_PER_MSEC);
3310 }
3311
3312 if (r < 0)
3313 return log_error_errno(r, "Failed to reread partition table: %m");
3314 }
3315
3316 e = udev_enumerate_unref(e);
3317 }
3318
3319 first = udev_enumerate_get_list_entry(e);
3320 udev_list_entry_foreach(item, first) {
3321 _cleanup_udev_device_unref_ struct udev_device *q;
3322 const char *node;
3323 unsigned long long flags;
3324 blkid_partition pp;
3325 dev_t qn;
3326 int nr;
3327
3328 errno = 0;
3329 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3330 if (!q) {
3331 if (!errno)
3332 errno = ENOMEM;
3333
3334 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3335 return -errno;
3336 }
3337
3338 qn = udev_device_get_devnum(q);
3339 if (major(qn) == 0)
3340 continue;
3341
3342 if (st.st_rdev == qn)
3343 continue;
3344
3345 node = udev_device_get_devnode(q);
3346 if (!node)
3347 continue;
3348
3349 pp = blkid_partlist_devno_to_partition(pl, qn);
3350 if (!pp)
3351 continue;
3352
3353 flags = blkid_partition_get_flags(pp);
3354
3355 nr = blkid_partition_get_partno(pp);
3356 if (nr < 0)
3357 continue;
3358
3359 if (is_gpt) {
3360 sd_id128_t type_id;
3361 const char *stype;
3362
3363 if (flags & GPT_FLAG_NO_AUTO)
3364 continue;
3365
3366 stype = blkid_partition_get_type_string(pp);
3367 if (!stype)
3368 continue;
3369
3370 if (sd_id128_from_string(stype, &type_id) < 0)
3371 continue;
3372
3373 if (sd_id128_equal(type_id, GPT_HOME)) {
3374
3375 if (home && nr >= home_nr)
3376 continue;
3377
3378 home_nr = nr;
3379 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3380
3381 r = free_and_strdup(&home, node);
3382 if (r < 0)
3383 return log_oom();
3384
3385 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3386
3387 if (srv && nr >= srv_nr)
3388 continue;
3389
3390 srv_nr = nr;
3391 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3392
3393 r = free_and_strdup(&srv, node);
3394 if (r < 0)
3395 return log_oom();
3396 }
3397 #ifdef GPT_ROOT_NATIVE
3398 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3399
3400 if (root && nr >= root_nr)
3401 continue;
3402
3403 root_nr = nr;
3404 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3405
3406 r = free_and_strdup(&root, node);
3407 if (r < 0)
3408 return log_oom();
3409 }
3410 #endif
3411 #ifdef GPT_ROOT_SECONDARY
3412 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3413
3414 if (secondary_root && nr >= secondary_root_nr)
3415 continue;
3416
3417 secondary_root_nr = nr;
3418 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3419
3420 r = free_and_strdup(&secondary_root, node);
3421 if (r < 0)
3422 return log_oom();
3423 }
3424 #endif
3425 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3426
3427 if (generic)
3428 multiple_generic = true;
3429 else {
3430 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3431
3432 r = free_and_strdup(&generic, node);
3433 if (r < 0)
3434 return log_oom();
3435 }
3436 }
3437
3438 } else if (is_mbr) {
3439 int type;
3440
3441 if (flags != 0x80) /* Bootable flag */
3442 continue;
3443
3444 type = blkid_partition_get_type(pp);
3445 if (type != 0x83) /* Linux partition */
3446 continue;
3447
3448 if (generic)
3449 multiple_generic = true;
3450 else {
3451 generic_rw = true;
3452
3453 r = free_and_strdup(&root, node);
3454 if (r < 0)
3455 return log_oom();
3456 }
3457 }
3458 }
3459
3460 if (root) {
3461 *root_device = root;
3462 root = NULL;
3463
3464 *root_device_rw = root_rw;
3465 *secondary = false;
3466 } else if (secondary_root) {
3467 *root_device = secondary_root;
3468 secondary_root = NULL;
3469
3470 *root_device_rw = secondary_root_rw;
3471 *secondary = true;
3472 } else if (generic) {
3473
3474 /* There were no partitions with precise meanings
3475 * around, but we found generic partitions. In this
3476 * case, if there's only one, we can go ahead and boot
3477 * it, otherwise we bail out, because we really cannot
3478 * make any sense of it. */
3479
3480 if (multiple_generic) {
3481 log_error("Identified multiple bootable Linux partitions on\n"
3482 " %s\n"
3483 PARTITION_TABLE_BLURB, arg_image);
3484 return -EINVAL;
3485 }
3486
3487 *root_device = generic;
3488 generic = NULL;
3489
3490 *root_device_rw = generic_rw;
3491 *secondary = false;
3492 } else {
3493 log_error("Failed to identify root partition in disk image\n"
3494 " %s\n"
3495 PARTITION_TABLE_BLURB, arg_image);
3496 return -EINVAL;
3497 }
3498
3499 if (home) {
3500 *home_device = home;
3501 home = NULL;
3502
3503 *home_device_rw = home_rw;
3504 }
3505
3506 if (srv) {
3507 *srv_device = srv;
3508 srv = NULL;
3509
3510 *srv_device_rw = srv_rw;
3511 }
3512
3513 return 0;
3514 #else
3515 log_error("--image= is not supported, compiled without blkid support.");
3516 return -EOPNOTSUPP;
3517 #endif
3518 }
3519
/* Mounts the block device "what" below "where". When "directory" is
 * non-NULL it is appended to "where" to form the mount point. The file
 * system type is probed through libblkid; LUKS containers are refused.
 * The mount is made read-only when "rw" is false or arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * built without HAVE_BLKID the call always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        unsigned long mount_flags = MS_NODEV;
        const char *fstype, *target;
        int r;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-device setting */
        if (!rw || arg_read_only)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        /* blkid does not reliably set errno, hence clear it first so
         * that we can tell OOM apart from other failures. */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* We are only interested in the superblock type */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r != 0) {
                if (r == -1 || r == 1) {
                        log_error("Cannot determine file system type of %s", what);
                        return -EINVAL;
                }

                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3583
3584 static int mount_devices(
3585 const char *where,
3586 const char *root_device, bool root_device_rw,
3587 const char *home_device, bool home_device_rw,
3588 const char *srv_device, bool srv_device_rw) {
3589 int r;
3590
3591 assert(where);
3592
3593 if (root_device) {
3594 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3595 if (r < 0)
3596 return log_error_errno(r, "Failed to mount root directory: %m");
3597 }
3598
3599 if (home_device) {
3600 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3601 if (r < 0)
3602 return log_error_errno(r, "Failed to mount home directory: %m");
3603 }
3604
3605 if (srv_device) {
3606 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3607 if (r < 0)
3608 return log_error_errno(r, "Failed to mount server data directory: %m");
3609 }
3610
3611 return 0;
3612 }
3613
3614 static void loop_remove(int nr, int *image_fd) {
3615 _cleanup_close_ int control = -1;
3616 int r;
3617
3618 if (nr < 0)
3619 return;
3620
3621 if (image_fd && *image_fd >= 0) {
3622 r = ioctl(*image_fd, LOOP_CLR_FD);
3623 if (r < 0)
3624 log_debug_errno(errno, "Failed to close loop image: %m");
3625 *image_fd = safe_close(*image_fd);
3626 }
3627
3628 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3629 if (control < 0) {
3630 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3631 return;
3632 }
3633
3634 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3635 if (r < 0)
3636 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3637 }
3638
3639 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3640 int pipe_fds[2];
3641 pid_t pid;
3642
3643 assert(database);
3644 assert(key);
3645 assert(rpid);
3646
3647 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3648 return log_error_errno(errno, "Failed to allocate pipe: %m");
3649
3650 pid = fork();
3651 if (pid < 0)
3652 return log_error_errno(errno, "Failed to fork getent child: %m");
3653 else if (pid == 0) {
3654 int nullfd;
3655 char *empty_env = NULL;
3656
3657 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3658 _exit(EXIT_FAILURE);
3659
3660 if (pipe_fds[0] > 2)
3661 safe_close(pipe_fds[0]);
3662 if (pipe_fds[1] > 2)
3663 safe_close(pipe_fds[1]);
3664
3665 nullfd = open("/dev/null", O_RDWR);
3666 if (nullfd < 0)
3667 _exit(EXIT_FAILURE);
3668
3669 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3670 _exit(EXIT_FAILURE);
3671
3672 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3673 _exit(EXIT_FAILURE);
3674
3675 if (nullfd > 2)
3676 safe_close(nullfd);
3677
3678 (void) reset_all_signal_handlers();
3679 (void) reset_signal_mask();
3680 close_all_fds(NULL, 0);
3681
3682 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3683 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3684 _exit(EXIT_FAILURE);
3685 }
3686
3687 pipe_fds[1] = safe_close(pipe_fds[1]);
3688
3689 *rpid = pid;
3690
3691 return pipe_fds[0];
3692 }
3693
3694 static int change_uid_gid(char **_home) {
3695 char line[LINE_MAX], *x, *u, *g, *h;
3696 const char *word, *state;
3697 _cleanup_free_ uid_t *uids = NULL;
3698 _cleanup_free_ char *home = NULL;
3699 _cleanup_fclose_ FILE *f = NULL;
3700 _cleanup_close_ int fd = -1;
3701 unsigned n_uids = 0;
3702 size_t sz = 0, l;
3703 uid_t uid;
3704 gid_t gid;
3705 pid_t pid;
3706 int r;
3707
3708 assert(_home);
3709
3710 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3711 /* Reset everything fully to 0, just in case */
3712
3713 r = reset_uid_gid();
3714 if (r < 0)
3715 return log_error_errno(r, "Failed to become root: %m");
3716
3717 *_home = NULL;
3718 return 0;
3719 }
3720
3721 /* First, get user credentials */
3722 fd = spawn_getent("passwd", arg_user, &pid);
3723 if (fd < 0)
3724 return fd;
3725
3726 f = fdopen(fd, "r");
3727 if (!f)
3728 return log_oom();
3729 fd = -1;
3730
3731 if (!fgets(line, sizeof(line), f)) {
3732
3733 if (!ferror(f)) {
3734 log_error("Failed to resolve user %s.", arg_user);
3735 return -ESRCH;
3736 }
3737
3738 log_error_errno(errno, "Failed to read from getent: %m");
3739 return -errno;
3740 }
3741
3742 truncate_nl(line);
3743
3744 wait_for_terminate_and_warn("getent passwd", pid, true);
3745
3746 x = strchr(line, ':');
3747 if (!x) {
3748 log_error("/etc/passwd entry has invalid user field.");
3749 return -EIO;
3750 }
3751
3752 u = strchr(x+1, ':');
3753 if (!u) {
3754 log_error("/etc/passwd entry has invalid password field.");
3755 return -EIO;
3756 }
3757
3758 u++;
3759 g = strchr(u, ':');
3760 if (!g) {
3761 log_error("/etc/passwd entry has invalid UID field.");
3762 return -EIO;
3763 }
3764
3765 *g = 0;
3766 g++;
3767 x = strchr(g, ':');
3768 if (!x) {
3769 log_error("/etc/passwd entry has invalid GID field.");
3770 return -EIO;
3771 }
3772
3773 *x = 0;
3774 h = strchr(x+1, ':');
3775 if (!h) {
3776 log_error("/etc/passwd entry has invalid GECOS field.");
3777 return -EIO;
3778 }
3779
3780 h++;
3781 x = strchr(h, ':');
3782 if (!x) {
3783 log_error("/etc/passwd entry has invalid home directory field.");
3784 return -EIO;
3785 }
3786
3787 *x = 0;
3788
3789 r = parse_uid(u, &uid);
3790 if (r < 0) {
3791 log_error("Failed to parse UID of user.");
3792 return -EIO;
3793 }
3794
3795 r = parse_gid(g, &gid);
3796 if (r < 0) {
3797 log_error("Failed to parse GID of user.");
3798 return -EIO;
3799 }
3800
3801 home = strdup(h);
3802 if (!home)
3803 return log_oom();
3804
3805 /* Second, get group memberships */
3806 fd = spawn_getent("initgroups", arg_user, &pid);
3807 if (fd < 0)
3808 return fd;
3809
3810 fclose(f);
3811 f = fdopen(fd, "r");
3812 if (!f)
3813 return log_oom();
3814 fd = -1;
3815
3816 if (!fgets(line, sizeof(line), f)) {
3817 if (!ferror(f)) {
3818 log_error("Failed to resolve user %s.", arg_user);
3819 return -ESRCH;
3820 }
3821
3822 log_error_errno(errno, "Failed to read from getent: %m");
3823 return -errno;
3824 }
3825
3826 truncate_nl(line);
3827
3828 wait_for_terminate_and_warn("getent initgroups", pid, true);
3829
3830 /* Skip over the username and subsequent separator whitespace */
3831 x = line;
3832 x += strcspn(x, WHITESPACE);
3833 x += strspn(x, WHITESPACE);
3834
3835 FOREACH_WORD(word, l, x, state) {
3836 char c[l+1];
3837
3838 memcpy(c, word, l);
3839 c[l] = 0;
3840
3841 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3842 return log_oom();
3843
3844 r = parse_uid(c, &uids[n_uids++]);
3845 if (r < 0) {
3846 log_error("Failed to parse group data from getent.");
3847 return -EIO;
3848 }
3849 }
3850
3851 r = mkdir_parents(home, 0775);
3852 if (r < 0)
3853 return log_error_errno(r, "Failed to make home root directory: %m");
3854
3855 r = mkdir_safe(home, 0755, uid, gid);
3856 if (r < 0 && r != -EEXIST)
3857 return log_error_errno(r, "Failed to make home directory: %m");
3858
3859 (void) fchown(STDIN_FILENO, uid, gid);
3860 (void) fchown(STDOUT_FILENO, uid, gid);
3861 (void) fchown(STDERR_FILENO, uid, gid);
3862
3863 if (setgroups(n_uids, uids) < 0)
3864 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3865
3866 if (setresgid(gid, gid, gid) < 0)
3867 return log_error_errno(errno, "setregid() failed: %m");
3868
3869 if (setresuid(uid, uid, uid) < 0)
3870 return log_error_errno(errno, "setreuid() failed: %m");
3871
3872 if (_home) {
3873 *_home = home;
3874 home = NULL;
3875 }
3876
3877 return 0;
3878 }
3879
3880 /*
3881 * Return values:
3882 * < 0 : wait_for_terminate() failed to get the state of the
3883 * container, the container was terminated by a signal, or
3884 * failed for an unknown reason. No change is made to the
3885 * container argument.
3886 * > 0 : The program executed in the container terminated with an
3887 * error. The exit code of the program executed in the
3888 * container is returned. The container argument has been set
3889 * to CONTAINER_TERMINATED.
3890 * 0 : The container is being rebooted, has been shut down or exited
3891 * successfully. The container argument has been set to either
3892 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3893 *
3894 * That is, success is indicated by a return value of zero, and an
3895 * error is indicated by a non-zero value.
3896 */
3897 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3898 siginfo_t status;
3899 int r;
3900
3901 r = wait_for_terminate(pid, &status);
3902 if (r < 0)
3903 return log_warning_errno(r, "Failed to wait for container: %m");
3904
3905 switch (status.si_code) {
3906
3907 case CLD_EXITED:
3908 if (status.si_status == 0) {
3909 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3910
3911 } else
3912 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3913
3914 *container = CONTAINER_TERMINATED;
3915 return status.si_status;
3916
3917 case CLD_KILLED:
3918 if (status.si_status == SIGINT) {
3919
3920 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3921 *container = CONTAINER_TERMINATED;
3922 return 0;
3923
3924 } else if (status.si_status == SIGHUP) {
3925
3926 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3927 *container = CONTAINER_REBOOTED;
3928 return 0;
3929 }
3930
3931 /* CLD_KILLED fallthrough */
3932
3933 case CLD_DUMPED:
3934 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3935 return -EIO;
3936
3937 default:
3938 log_error("Container %s failed due to unknown reason.", arg_machine);
3939 return -EIO;
3940 }
3941
3942 return r;
3943 }
3944
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3946
3947 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3948 pid_t pid;
3949
3950 pid = PTR_TO_UINT32(userdata);
3951 if (pid > 0) {
3952 if (kill(pid, arg_kill_signal) >= 0) {
3953 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3954 sd_event_source_set_userdata(s, NULL);
3955 return 0;
3956 }
3957 }
3958
3959 sd_event_exit(sd_event_source_get_event(s), 0);
3960 return 0;
3961 }
3962
3963 static int determine_names(void) {
3964 int r;
3965
3966 if (!arg_image && !arg_directory) {
3967 if (arg_machine) {
3968 _cleanup_(image_unrefp) Image *i = NULL;
3969
3970 r = image_find(arg_machine, &i);
3971 if (r < 0)
3972 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3973 else if (r == 0) {
3974 log_error("No image for machine '%s': %m", arg_machine);
3975 return -ENOENT;
3976 }
3977
3978 if (i->type == IMAGE_RAW)
3979 r = set_sanitized_path(&arg_image, i->path);
3980 else
3981 r = set_sanitized_path(&arg_directory, i->path);
3982 if (r < 0)
3983 return log_error_errno(r, "Invalid image directory: %m");
3984
3985 if (!arg_ephemeral)
3986 arg_read_only = arg_read_only || i->read_only;
3987 } else
3988 arg_directory = get_current_dir_name();
3989
3990 if (!arg_directory && !arg_machine) {
3991 log_error("Failed to determine path, please use -D or -i.");
3992 return -EINVAL;
3993 }
3994 }
3995
3996 if (!arg_machine) {
3997 if (arg_directory && path_equal(arg_directory, "/"))
3998 arg_machine = gethostname_malloc();
3999 else
4000 arg_machine = strdup(basename(arg_image ?: arg_directory));
4001
4002 if (!arg_machine)
4003 return log_oom();
4004
4005 hostname_cleanup(arg_machine, false);
4006 if (!machine_name_is_valid(arg_machine)) {
4007 log_error("Failed to determine machine name automatically, please use -M.");
4008 return -EINVAL;
4009 }
4010
4011 if (arg_ephemeral) {
4012 char *b;
4013
4014 /* Add a random suffix when this is an
4015 * ephemeral machine, so that we can run many
4016 * instances at once without manually having
4017 * to specify -M each time. */
4018
4019 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4020 return log_oom();
4021
4022 free(arg_machine);
4023 arg_machine = b;
4024 }
4025 }
4026
4027 return 0;
4028 }
4029
4030 static int determine_uid_shift(const char *directory) {
4031 int r;
4032
4033 if (!arg_userns) {
4034 arg_uid_shift = 0;
4035 return 0;
4036 }
4037
4038 if (arg_uid_shift == UID_INVALID) {
4039 struct stat st;
4040
4041 r = stat(directory, &st);
4042 if (r < 0)
4043 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4044
4045 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4046
4047 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4048 log_error("UID and GID base of %s don't match.", directory);
4049 return -EINVAL;
4050 }
4051
4052 arg_uid_range = UINT32_C(0x10000);
4053 }
4054
4055 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4056 log_error("UID base too high for UID range.");
4057 return -EINVAL;
4058 }
4059
4060 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4061 return 0;
4062 }
4063
4064 static int inner_child(
4065 Barrier *barrier,
4066 const char *directory,
4067 bool secondary,
4068 int kmsg_socket,
4069 int rtnl_socket,
4070 FDSet *fds,
4071 int argc,
4072 char *argv[]) {
4073
4074 _cleanup_free_ char *home = NULL;
4075 unsigned n_env = 2;
4076 const char *envp[] = {
4077 "PATH=" DEFAULT_PATH_SPLIT_USR,
4078 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4079 NULL, /* TERM */
4080 NULL, /* HOME */
4081 NULL, /* USER */
4082 NULL, /* LOGNAME */
4083 NULL, /* container_uuid */
4084 NULL, /* LISTEN_FDS */
4085 NULL, /* LISTEN_PID */
4086 NULL
4087 };
4088
4089 _cleanup_strv_free_ char **env_use = NULL;
4090 int r;
4091
4092 assert(barrier);
4093 assert(directory);
4094 assert(kmsg_socket >= 0);
4095
4096 if (arg_userns) {
4097 /* Tell the parent, that it now can write the UID map. */
4098 (void) barrier_place(barrier); /* #1 */
4099
4100 /* Wait until the parent wrote the UID map */
4101 if (!barrier_place_and_sync(barrier)) { /* #2 */
4102 log_error("Parent died too early");
4103 return -ESRCH;
4104 }
4105 }
4106
4107 r = mount_all(NULL, true);
4108 if (r < 0)
4109 return r;
4110
4111 /* Wait until we are cgroup-ified, so that we
4112 * can mount the right cgroup path writable */
4113 if (!barrier_place_and_sync(barrier)) { /* #3 */
4114 log_error("Parent died too early");
4115 return -ESRCH;
4116 }
4117
4118 r = mount_systemd_cgroup_writable("");
4119 if (r < 0)
4120 return r;
4121
4122 r = reset_uid_gid();
4123 if (r < 0)
4124 return log_error_errno(r, "Couldn't become new root: %m");
4125
4126 r = setup_boot_id(NULL);
4127 if (r < 0)
4128 return r;
4129
4130 r = setup_kmsg(NULL, kmsg_socket);
4131 if (r < 0)
4132 return r;
4133 kmsg_socket = safe_close(kmsg_socket);
4134
4135 umask(0022);
4136
4137 if (setsid() < 0)
4138 return log_error_errno(errno, "setsid() failed: %m");
4139
4140 if (arg_private_network)
4141 loopback_setup();
4142
4143 r = send_rtnl(rtnl_socket);
4144 if (r < 0)
4145 return r;
4146 rtnl_socket = safe_close(rtnl_socket);
4147
4148 if (drop_capabilities() < 0)
4149 return log_error_errno(errno, "drop_capabilities() failed: %m");
4150
4151 setup_hostname();
4152
4153 if (arg_personality != PERSONALITY_INVALID) {
4154 if (personality(arg_personality) < 0)
4155 return log_error_errno(errno, "personality() failed: %m");
4156 } else if (secondary) {
4157 if (personality(PER_LINUX32) < 0)
4158 return log_error_errno(errno, "personality() failed: %m");
4159 }
4160
4161 #ifdef HAVE_SELINUX
4162 if (arg_selinux_context)
4163 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4164 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4165 #endif
4166
4167 r = change_uid_gid(&home);
4168 if (r < 0)
4169 return r;
4170
4171 envp[n_env] = strv_find_prefix(environ, "TERM=");
4172 if (envp[n_env])
4173 n_env ++;
4174
4175 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4176 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4177 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4178 return log_oom();
4179
4180 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4181 char as_uuid[37];
4182
4183 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4184 return log_oom();
4185 }
4186
4187 if (fdset_size(fds) > 0) {
4188 r = fdset_cloexec(fds, false);
4189 if (r < 0)
4190 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4191
4192 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4193 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4194 return log_oom();
4195 }
4196
4197 env_use = strv_env_merge(2, envp, arg_setenv);
4198 if (!env_use)
4199 return log_oom();
4200
4201 /* Let the parent know that we are ready and
4202 * wait until the parent is ready with the
4203 * setup, too... */
4204 if (!barrier_place_and_sync(barrier)) { /* #4 */
4205 log_error("Parent died too early");
4206 return -ESRCH;
4207 }
4208
4209 /* Now, explicitly close the log, so that we
4210 * then can close all remaining fds. Closing
4211 * the log explicitly first has the benefit
4212 * that the logging subsystem knows about it,
4213 * and is thus ready to be reopened should we
4214 * need it again. Note that the other fds
4215 * closed here are at least the locking and
4216 * barrier fds. */
4217 log_close();
4218 (void) fdset_close_others(fds);
4219
4220 if (arg_boot) {
4221 char **a;
4222 size_t m;
4223
4224 /* Automatically search for the init system */
4225
4226 m = 1 + argc - optind;
4227 a = newa(char*, m + 1);
4228 memcpy(a + 1, argv + optind, m * sizeof(char*));
4229
4230 a[0] = (char*) "/usr/lib/systemd/systemd";
4231 execve(a[0], a, env_use);
4232
4233 a[0] = (char*) "/lib/systemd/systemd";
4234 execve(a[0], a, env_use);
4235
4236 a[0] = (char*) "/sbin/init";
4237 execve(a[0], a, env_use);
4238 } else if (argc > optind)
4239 execvpe(argv[optind], argv + optind, env_use);
4240 else {
4241 chdir(home ? home : "/root");
4242 execle("/bin/bash", "-bash", NULL, env_use);
4243 execle("/bin/sh", "-sh", NULL, env_use);
4244 }
4245
4246 (void) log_open();
4247 return log_error_errno(errno, "execv() failed: %m");
4248 }
4249
4250 static int outer_child(
4251 Barrier *barrier,
4252 const char *directory,
4253 const char *console,
4254 const char *root_device, bool root_device_rw,
4255 const char *home_device, bool home_device_rw,
4256 const char *srv_device, bool srv_device_rw,
4257 bool interactive,
4258 bool secondary,
4259 int pid_socket,
4260 int kmsg_socket,
4261 int rtnl_socket,
4262 FDSet *fds,
4263 int argc,
4264 char *argv[]) {
4265
4266 pid_t pid;
4267 ssize_t l;
4268 int r;
4269
4270 assert(barrier);
4271 assert(directory);
4272 assert(console);
4273 assert(pid_socket >= 0);
4274 assert(kmsg_socket >= 0);
4275
4276 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4277 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4278
4279 if (interactive) {
4280 close_nointr(STDIN_FILENO);
4281 close_nointr(STDOUT_FILENO);
4282 close_nointr(STDERR_FILENO);
4283
4284 r = open_terminal(console, O_RDWR);
4285 if (r != STDIN_FILENO) {
4286 if (r >= 0) {
4287 safe_close(r);
4288 r = -EINVAL;
4289 }
4290
4291 return log_error_errno(r, "Failed to open console: %m");
4292 }
4293
4294 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4295 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4296 return log_error_errno(errno, "Failed to duplicate console: %m");
4297 }
4298
4299 r = reset_audit_loginuid();
4300 if (r < 0)
4301 return r;
4302
4303 /* Mark everything as slave, so that we still
4304 * receive mounts from the real root, but don't
4305 * propagate mounts to the real root. */
4306 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4307 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4308
4309 r = mount_devices(directory,
4310 root_device, root_device_rw,
4311 home_device, home_device_rw,
4312 srv_device, srv_device_rw);
4313 if (r < 0)
4314 return r;
4315
4316 /* Turn directory into bind mount */
4317 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4318 return log_error_errno(errno, "Failed to make bind mount: %m");
4319
4320 r = setup_volatile(directory);
4321 if (r < 0)
4322 return r;
4323
4324 r = setup_volatile_state(directory);
4325 if (r < 0)
4326 return r;
4327
4328 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4329 if (r < 0)
4330 return r;
4331
4332 if (arg_read_only) {
4333 r = bind_remount_recursive(directory, true);
4334 if (r < 0)
4335 return log_error_errno(r, "Failed to make tree read-only: %m");
4336 }
4337
4338 r = mount_all(directory, false);
4339 if (r < 0)
4340 return r;
4341
4342 if (copy_devnodes(directory) < 0)
4343 return r;
4344
4345 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4346
4347 if (setup_pts(directory) < 0)
4348 return r;
4349
4350 r = setup_propagate(directory);
4351 if (r < 0)
4352 return r;
4353
4354 r = setup_dev_console(directory, console);
4355 if (r < 0)
4356 return r;
4357
4358 r = setup_seccomp();
4359 if (r < 0)
4360 return r;
4361
4362 r = setup_timezone(directory);
4363 if (r < 0)
4364 return r;
4365
4366 r = setup_resolv_conf(directory);
4367 if (r < 0)
4368 return r;
4369
4370 r = setup_journal(directory);
4371 if (r < 0)
4372 return r;
4373
4374 r = mount_custom(directory);
4375 if (r < 0)
4376 return r;
4377
4378 r = mount_cgroup(directory);
4379 if (r < 0)
4380 return r;
4381
4382 r = mount_move_root(directory);
4383 if (r < 0)
4384 return log_error_errno(r, "Failed to move root directory: %m");
4385
4386 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4387 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4388 (arg_private_network ? CLONE_NEWNET : 0) |
4389 (arg_userns ? CLONE_NEWUSER : 0),
4390 NULL);
4391 if (pid < 0)
4392 return log_error_errno(errno, "Failed to fork inner child: %m");
4393
4394 if (pid == 0) {
4395 pid_socket = safe_close(pid_socket);
4396
4397 /* The inner child has all namespaces that are
4398 * requested, so that we all are owned by the user if
4399 * user namespaces are turned on. */
4400
4401 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4402 if (r < 0)
4403 _exit(EXIT_FAILURE);
4404
4405 _exit(EXIT_SUCCESS);
4406 }
4407
4408 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4409 if (l < 0)
4410 return log_error_errno(errno, "Failed to send PID: %m");
4411 if (l != sizeof(pid)) {
4412 log_error("Short write while sending PID.");
4413 return -EIO;
4414 }
4415
4416 pid_socket = safe_close(pid_socket);
4417
4418 return 0;
4419 }
4420
4421 static int setup_uid_map(pid_t pid) {
4422 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4423 int r;
4424
4425 assert(pid > 1);
4426
4427 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4428 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4429 r = write_string_file(uid_map, line);
4430 if (r < 0)
4431 return log_error_errno(r, "Failed to write UID map: %m");
4432
4433 /* We always assign the same UID and GID ranges */
4434 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4435 r = write_string_file(uid_map, line);
4436 if (r < 0)
4437 return log_error_errno(r, "Failed to write GID map: %m");
4438
4439 return 0;
4440 }
4441
4442 static int chown_cgroup(pid_t pid) {
4443 _cleanup_free_ char *path = NULL, *fs = NULL;
4444 _cleanup_close_ int fd = -1;
4445 const char *fn;
4446 int r;
4447
4448 r = cg_pid_get_path(NULL, pid, &path);
4449 if (r < 0)
4450 return log_error_errno(r, "Failed to get container cgroup path: %m");
4451
4452 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4453 if (r < 0)
4454 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4455
4456 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4457 if (fd < 0)
4458 return log_error_errno(errno, "Failed to open %s: %m", fs);
4459
4460 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4461 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4462 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4463
4464 return 0;
4465 }
4466
4467 int main(int argc, char *argv[]) {
4468
4469 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4470 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4471 _cleanup_close_ int master = -1, image_fd = -1;
4472 _cleanup_fdset_free_ FDSet *fds = NULL;
4473 int r, n_fd_passed, loop_nr = -1;
4474 char veth_name[IFNAMSIZ];
4475 bool secondary = false, remove_subvol = false;
4476 sigset_t mask_chld;
4477 pid_t pid = 0;
4478 int ret = EXIT_SUCCESS;
4479 union in_addr_union exposed = {};
4480 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4481 bool interactive;
4482
4483 log_parse_environment();
4484 log_open();
4485
4486 r = parse_argv(argc, argv);
4487 if (r <= 0)
4488 goto finish;
4489
4490 r = determine_names();
4491 if (r < 0)
4492 goto finish;
4493
4494 r = determine_uid_shift(arg_directory);
4495 if (r < 0)
4496 return r;
4497
4498 if (geteuid() != 0) {
4499 log_error("Need to be root.");
4500 r = -EPERM;
4501 goto finish;
4502 }
4503
4504 n_fd_passed = sd_listen_fds(false);
4505 if (n_fd_passed > 0) {
4506 r = fdset_new_listen_fds(&fds, false);
4507 if (r < 0) {
4508 log_error_errno(r, "Failed to collect file descriptors: %m");
4509 goto finish;
4510 }
4511 }
4512
4513 if (arg_directory) {
4514 assert(!arg_image);
4515
4516 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4517 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4518 r = -EINVAL;
4519 goto finish;
4520 }
4521
4522 if (arg_ephemeral) {
4523 _cleanup_free_ char *np = NULL;
4524
4525 /* If the specified path is a mount point we
4526 * generate the new snapshot immediately
4527 * inside it under a random name. However if
4528 * the specified is not a mount point we
4529 * create the new snapshot in the parent
4530 * directory, just next to it. */
4531 r = path_is_mount_point(arg_directory, 0);
4532 if (r < 0) {
4533 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4534 goto finish;
4535 }
4536 if (r > 0)
4537 r = tempfn_random_child(arg_directory, "machine.", &np);
4538 else
4539 r = tempfn_random(arg_directory, "machine.", &np);
4540 if (r < 0) {
4541 log_error_errno(r, "Failed to generate name for snapshot: %m");
4542 goto finish;
4543 }
4544
4545 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4546 if (r < 0) {
4547 log_error_errno(r, "Failed to lock %s: %m", np);
4548 goto finish;
4549 }
4550
4551 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4552 if (r < 0) {
4553 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4554 goto finish;
4555 }
4556
4557 free(arg_directory);
4558 arg_directory = np;
4559 np = NULL;
4560
4561 remove_subvol = true;
4562
4563 } else {
4564 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4565 if (r == -EBUSY) {
4566 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4567 goto finish;
4568 }
4569 if (r < 0) {
4570 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4571 return r;
4572 }
4573
4574 if (arg_template) {
4575 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4576 if (r == -EEXIST) {
4577 if (!arg_quiet)
4578 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4579 } else if (r < 0) {
4580 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4581 goto finish;
4582 } else {
4583 if (!arg_quiet)
4584 log_info("Populated %s from template %s.", arg_directory, arg_template);
4585 }
4586 }
4587 }
4588
4589 if (arg_boot) {
4590 if (path_is_os_tree(arg_directory) <= 0) {
4591 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4592 r = -EINVAL;
4593 goto finish;
4594 }
4595 } else {
4596 const char *p;
4597
4598 p = strjoina(arg_directory,
4599 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4600 if (access(p, F_OK) < 0) {
4601 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4602 r = -EINVAL;
4603 goto finish;
4604 }
4605 }
4606
4607 } else {
4608 char template[] = "/tmp/nspawn-root-XXXXXX";
4609
4610 assert(arg_image);
4611 assert(!arg_template);
4612
4613 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4614 if (r == -EBUSY) {
4615 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4616 goto finish;
4617 }
4618 if (r < 0) {
4619 r = log_error_errno(r, "Failed to create image lock: %m");
4620 goto finish;
4621 }
4622
4623 if (!mkdtemp(template)) {
4624 log_error_errno(errno, "Failed to create temporary directory: %m");
4625 r = -errno;
4626 goto finish;
4627 }
4628
4629 arg_directory = strdup(template);
4630 if (!arg_directory) {
4631 r = log_oom();
4632 goto finish;
4633 }
4634
4635 image_fd = setup_image(&device_path, &loop_nr);
4636 if (image_fd < 0) {
4637 r = image_fd;
4638 goto finish;
4639 }
4640
4641 r = dissect_image(image_fd,
4642 &root_device, &root_device_rw,
4643 &home_device, &home_device_rw,
4644 &srv_device, &srv_device_rw,
4645 &secondary);
4646 if (r < 0)
4647 goto finish;
4648 }
4649
4650 r = custom_mounts_prepare();
4651 if (r < 0)
4652 goto finish;
4653
4654 interactive =
4655 isatty(STDIN_FILENO) > 0 &&
4656 isatty(STDOUT_FILENO) > 0;
4657
4658 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4659 if (master < 0) {
4660 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4661 goto finish;
4662 }
4663
4664 r = ptsname_malloc(master, &console);
4665 if (r < 0) {
4666 r = log_error_errno(r, "Failed to determine tty name: %m");
4667 goto finish;
4668 }
4669
4670 if (unlockpt(master) < 0) {
4671 r = log_error_errno(errno, "Failed to unlock tty: %m");
4672 goto finish;
4673 }
4674
4675 if (!arg_quiet)
4676 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4677 arg_machine, arg_image ?: arg_directory);
4678
4679 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4680
4681 assert_se(sigemptyset(&mask_chld) == 0);
4682 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4683
4684 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4685 r = log_error_errno(errno, "Failed to become subreaper: %m");
4686 goto finish;
4687 }
4688
4689 for (;;) {
4690 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 };
4691 ContainerStatus container_status;
4692 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4693 static const struct sigaction sa = {
4694 .sa_handler = nop_handler,
4695 .sa_flags = SA_NOCLDSTOP,
4696 };
4697 int ifi = 0;
4698 ssize_t l;
4699 _cleanup_event_unref_ sd_event *event = NULL;
4700 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4701 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4702 char last_char = 0;
4703
4704 r = barrier_create(&barrier);
4705 if (r < 0) {
4706 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4707 goto finish;
4708 }
4709
4710 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4711 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4712 goto finish;
4713 }
4714
4715 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4716 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4717 goto finish;
4718 }
4719
4720 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4721 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4722 goto finish;
4723 }
4724
4725 /* Child can be killed before execv(), so handle SIGCHLD
4726 * in order to interrupt parent's blocking calls and
4727 * give it a chance to call wait() and terminate. */
4728 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4729 if (r < 0) {
4730 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4731 goto finish;
4732 }
4733
4734 r = sigaction(SIGCHLD, &sa, NULL);
4735 if (r < 0) {
4736 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4737 goto finish;
4738 }
4739
4740 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4741 if (pid < 0) {
4742 if (errno == EINVAL)
4743 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4744 else
4745 r = log_error_errno(errno, "clone() failed: %m");
4746
4747 goto finish;
4748 }
4749
4750 if (pid == 0) {
4751 /* The outer child only has a file system namespace. */
4752 barrier_set_role(&barrier, BARRIER_CHILD);
4753
4754 master = safe_close(master);
4755
4756 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4757 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4758 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4759
4760 (void) reset_all_signal_handlers();
4761 (void) reset_signal_mask();
4762
4763 r = outer_child(&barrier,
4764 arg_directory,
4765 console,
4766 root_device, root_device_rw,
4767 home_device, home_device_rw,
4768 srv_device, srv_device_rw,
4769 interactive,
4770 secondary,
4771 pid_socket_pair[1],
4772 kmsg_socket_pair[1],
4773 rtnl_socket_pair[1],
4774 fds,
4775 argc, argv);
4776 if (r < 0)
4777 _exit(EXIT_FAILURE);
4778
4779 _exit(EXIT_SUCCESS);
4780 }
4781
4782 barrier_set_role(&barrier, BARRIER_PARENT);
4783
4784 fdset_free(fds);
4785 fds = NULL;
4786
4787 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4788 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4789 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4790
4791 /* Wait for the outer child. */
4792 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4793 if (r < 0)
4794 goto finish;
4795 if (r != 0) {
4796 r = -EIO;
4797 goto finish;
4798 }
4799 pid = 0;
4800
4801 /* And now retrieve the PID of the inner child. */
4802 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4803 if (l < 0) {
4804 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4805 goto finish;
4806 }
4807 if (l != sizeof(pid)) {
4808 log_error("Short read while reading inner child PID: %m");
4809 r = EIO;
4810 goto finish;
4811 }
4812
4813 log_debug("Init process invoked as PID " PID_FMT, pid);
4814
4815 if (arg_userns) {
4816 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4817 log_error("Child died too early.");
4818 r = -ESRCH;
4819 goto finish;
4820 }
4821
4822 r = setup_uid_map(pid);
4823 if (r < 0)
4824 goto finish;
4825
4826 (void) barrier_place(&barrier); /* #2 */
4827 }
4828
4829 r = move_network_interfaces(pid);
4830 if (r < 0)
4831 goto finish;
4832
4833 r = setup_veth(pid, veth_name, &ifi);
4834 if (r < 0)
4835 goto finish;
4836
4837 r = setup_bridge(veth_name, &ifi);
4838 if (r < 0)
4839 goto finish;
4840
4841 r = setup_macvlan(pid);
4842 if (r < 0)
4843 goto finish;
4844
4845 r = setup_ipvlan(pid);
4846 if (r < 0)
4847 goto finish;
4848
4849 r = register_machine(pid, ifi);
4850 if (r < 0)
4851 goto finish;
4852
4853 r = chown_cgroup(pid);
4854 if (r < 0)
4855 goto finish;
4856
4857 /* Notify the child that the parent is ready with all
4858 * its setup (including cgroup-ification), and that
4859 * the child can now hand over control to the code to
4860 * run inside the container. */
4861 (void) barrier_place(&barrier); /* #3 */
4862
4863 /* Block SIGCHLD here, before notifying child.
4864 * process_pty() will handle it with the other signals. */
4865 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4866
4867 /* Reset signal to default */
4868 r = default_signals(SIGCHLD, -1);
4869 if (r < 0) {
4870 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4871 goto finish;
4872 }
4873
4874 /* Let the child know that we are ready and wait until the child is completely ready now. */
4875 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4876 log_error("Client died too early.");
4877 r = -ESRCH;
4878 goto finish;
4879 }
4880
4881 sd_notifyf(false,
4882 "READY=1\n"
4883 "STATUS=Container running.\n"
4884 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4885
4886 r = sd_event_new(&event);
4887 if (r < 0) {
4888 log_error_errno(r, "Failed to get default event source: %m");
4889 goto finish;
4890 }
4891
4892 if (arg_kill_signal > 0) {
4893 /* Try to kill the init system on SIGINT or SIGTERM */
4894 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4895 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4896 } else {
4897 /* Immediately exit */
4898 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4899 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4900 }
4901
4902 /* simply exit on sigchld */
4903 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4904
4905 if (arg_expose_ports) {
4906 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4907 if (r < 0)
4908 goto finish;
4909
4910 (void) expose_ports(rtnl, &exposed);
4911 }
4912
4913 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4914
4915 r = pty_forward_new(event, master, true, !interactive, &forward);
4916 if (r < 0) {
4917 log_error_errno(r, "Failed to create PTY forwarder: %m");
4918 goto finish;
4919 }
4920
4921 r = sd_event_loop(event);
4922 if (r < 0) {
4923 log_error_errno(r, "Failed to run event loop: %m");
4924 goto finish;
4925 }
4926
4927 pty_forward_get_last_char(forward, &last_char);
4928
4929 forward = pty_forward_free(forward);
4930
4931 if (!arg_quiet && last_char != '\n')
4932 putc('\n', stdout);
4933
4934 /* Kill if it is not dead yet anyway */
4935 terminate_machine(pid);
4936
4937 /* Normally redundant, but better safe than sorry */
4938 kill(pid, SIGKILL);
4939
4940 r = wait_for_container(pid, &container_status);
4941 pid = 0;
4942
4943 if (r < 0)
4944 /* We failed to wait for the container, or the
4945 * container exited abnormally */
4946 goto finish;
4947 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4948 /* The container exited with a non-zero
4949 * status, or with zero status and no reboot
4950 * was requested. */
4951 ret = r;
4952 break;
4953 }
4954
4955 /* CONTAINER_REBOOTED, loop again */
4956
4957 if (arg_keep_unit) {
4958 /* Special handling if we are running as a
4959 * service: instead of simply restarting the
4960 * machine we want to restart the entire
4961 * service, so let's inform systemd about this
4962 * with the special exit code 133. The service
4963 * file uses RestartForceExitStatus=133 so
4964 * that this results in a full nspawn
4965 * restart. This is necessary since we might
4966 * have cgroup parameters set we want to have
4967 * flushed out. */
4968 ret = 133;
4969 r = 0;
4970 break;
4971 }
4972
4973 flush_ports(&exposed);
4974 }
4975
4976 finish:
4977 sd_notify(false,
4978 "STOPPING=1\n"
4979 "STATUS=Terminating...");
4980
4981 if (pid > 0)
4982 kill(pid, SIGKILL);
4983
4984 /* Try to flush whatever is still queued in the pty */
4985 if (master >= 0)
4986 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
4987
4988 loop_remove(loop_nr, &image_fd);
4989
4990 if (remove_subvol && arg_directory) {
4991 int k;
4992
4993 k = btrfs_subvol_remove(arg_directory, true);
4994 if (k < 0)
4995 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4996 }
4997
4998 if (arg_machine) {
4999 const char *p;
5000
5001 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5002 (void) rm_rf(p, REMOVE_ROOT);
5003 }
5004
5005 free(arg_directory);
5006 free(arg_template);
5007 free(arg_image);
5008 free(arg_machine);
5009 free(arg_user);
5010 strv_free(arg_setenv);
5011 strv_free(arg_network_interfaces);
5012 strv_free(arg_network_macvlan);
5013 strv_free(arg_network_ipvlan);
5014 custom_mount_free_all();
5015
5016 flush_ports(&exposed);
5017
5018 while (arg_expose_ports) {
5019 ExposePort *p = arg_expose_ports;
5020 LIST_REMOVE(ports, arg_expose_ports, p);
5021 free(p);
5022 }
5023
5024 return r < 0 ? EXIT_FAILURE : ret;
5025 }