]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #516 from utezduyar/consistent-get-callback-return
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* Container outcome codes: the container either terminated or asked for a
 * reboot. The code consuming these values is outside this part of the file. */
enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
};

typedef enum ContainerStatus ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option are tracked separately in arg_link_journal_try. */
enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3,
};

typedef enum LinkJournal LinkJournal;
123
/* Volatile mode selected with --volatile[=MODE]: off, fully on (also chosen
 * when the option is given without argument), or "state". */
enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
};

typedef enum Volatile Volatile;
129
/* Kind of a user-requested mount: --bind=/--bind-ro=, --tmpfs= or
 * --overlay=/--overlay-ro=. The numeric order matters: it is the secondary
 * sort key used by custom_mount_compare(). */
enum CustomMountType {
        CUSTOM_MOUNT_BIND    = 0,
        CUSTOM_MOUNT_TMPFS   = 1,
        CUSTOM_MOUNT_OVERLAY = 2,
};

typedef enum CustomMountType CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 free(arg_custom_mounts);
313 arg_custom_mounts = NULL;
314 arg_n_custom_mounts = 0;
315 }
316
317 static int custom_mount_compare(const void *a, const void *b) {
318 const CustomMount *x = a, *y = b;
319 int r;
320
321 r = path_compare(x->destination, y->destination);
322 if (r != 0)
323 return r;
324
325 if (x->type < y->type)
326 return -1;
327 if (x->type > y->type)
328 return 1;
329
330 return 0;
331 }
332
333 static int custom_mounts_prepare(void) {
334 unsigned i;
335 int r;
336
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
339
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i = 0; i < arg_n_custom_mounts; i++) {
342 CustomMount *m = &arg_custom_mounts[i];
343
344 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
345 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
346 return -EINVAL;
347 }
348
349 if (m->type != CUSTOM_MOUNT_OVERLAY)
350 continue;
351
352 if (m->work_dir)
353 continue;
354
355 if (m->read_only)
356 continue;
357
358 r = tempfn_random(m->source, NULL, &m->work_dir);
359 if (r < 0)
360 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
361 }
362
363 return 0;
364 }
365
/* Stores a cleaned-up version of "path" in *b, freeing whatever *b held
 * before. The path is canonicalized if it exists; if it does not exist
 * (ENOENT) it is merely made absolute relative to the current directory.
 * Returns 0 on success, a negative errno-style value otherwise (in which case
 * *b is left alone). */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Not there (yet): settle for an absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
386
387 static int parse_argv(int argc, char *argv[]) {
388
389 enum {
390 ARG_VERSION = 0x100,
391 ARG_PRIVATE_NETWORK,
392 ARG_UUID,
393 ARG_READ_ONLY,
394 ARG_CAPABILITY,
395 ARG_DROP_CAPABILITY,
396 ARG_LINK_JOURNAL,
397 ARG_BIND,
398 ARG_BIND_RO,
399 ARG_TMPFS,
400 ARG_OVERLAY,
401 ARG_OVERLAY_RO,
402 ARG_SETENV,
403 ARG_SHARE_SYSTEM,
404 ARG_REGISTER,
405 ARG_KEEP_UNIT,
406 ARG_NETWORK_INTERFACE,
407 ARG_NETWORK_MACVLAN,
408 ARG_NETWORK_IPVLAN,
409 ARG_NETWORK_BRIDGE,
410 ARG_PERSONALITY,
411 ARG_VOLATILE,
412 ARG_TEMPLATE,
413 ARG_PROPERTY,
414 ARG_PRIVATE_USERS,
415 ARG_KILL_SIGNAL,
416 };
417
418 static const struct option options[] = {
419 { "help", no_argument, NULL, 'h' },
420 { "version", no_argument, NULL, ARG_VERSION },
421 { "directory", required_argument, NULL, 'D' },
422 { "template", required_argument, NULL, ARG_TEMPLATE },
423 { "ephemeral", no_argument, NULL, 'x' },
424 { "user", required_argument, NULL, 'u' },
425 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, ARG_SETENV },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
449 { "network-veth", no_argument, NULL, 'n' },
450 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
451 { "personality", required_argument, NULL, ARG_PERSONALITY },
452 { "image", required_argument, NULL, 'i' },
453 { "volatile", optional_argument, NULL, ARG_VOLATILE },
454 { "port", required_argument, NULL, 'p' },
455 { "property", required_argument, NULL, ARG_PROPERTY },
456 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
457 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
458 {}
459 };
460
461 int c, r;
462 uint64_t plus = 0, minus = 0;
463
464 assert(argc >= 0);
465 assert(argv);
466
467 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
468
469 switch (c) {
470
471 case 'h':
472 help();
473 return 0;
474
475 case ARG_VERSION:
476 puts(PACKAGE_STRING);
477 puts(SYSTEMD_FEATURES);
478 return 0;
479
480 case 'D':
481 r = set_sanitized_path(&arg_directory, optarg);
482 if (r < 0)
483 return log_error_errno(r, "Invalid root directory: %m");
484
485 break;
486
487 case ARG_TEMPLATE:
488 r = set_sanitized_path(&arg_template, optarg);
489 if (r < 0)
490 return log_error_errno(r, "Invalid template directory: %m");
491
492 break;
493
494 case 'i':
495 r = set_sanitized_path(&arg_image, optarg);
496 if (r < 0)
497 return log_error_errno(r, "Invalid image path: %m");
498
499 break;
500
501 case 'x':
502 arg_ephemeral = true;
503 break;
504
505 case 'u':
506 free(arg_user);
507 arg_user = strdup(optarg);
508 if (!arg_user)
509 return log_oom();
510
511 break;
512
513 case ARG_NETWORK_BRIDGE:
514 arg_network_bridge = optarg;
515
516 /* fall through */
517
518 case 'n':
519 arg_network_veth = true;
520 arg_private_network = true;
521 break;
522
523 case ARG_NETWORK_INTERFACE:
524 if (strv_extend(&arg_network_interfaces, optarg) < 0)
525 return log_oom();
526
527 arg_private_network = true;
528 break;
529
530 case ARG_NETWORK_MACVLAN:
531 if (strv_extend(&arg_network_macvlan, optarg) < 0)
532 return log_oom();
533
534 arg_private_network = true;
535 break;
536
537 case ARG_NETWORK_IPVLAN:
538 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
539 return log_oom();
540
541 /* fall through */
542
543 case ARG_PRIVATE_NETWORK:
544 arg_private_network = true;
545 break;
546
547 case 'b':
548 arg_boot = true;
549 break;
550
551 case ARG_UUID:
552 r = sd_id128_from_string(optarg, &arg_uuid);
553 if (r < 0) {
554 log_error("Invalid UUID: %s", optarg);
555 return r;
556 }
557 break;
558
559 case 'S':
560 arg_slice = optarg;
561 break;
562
563 case 'M':
564 if (isempty(optarg)) {
565 free(arg_machine);
566 arg_machine = NULL;
567 } else {
568 if (!machine_name_is_valid(optarg)) {
569 log_error("Invalid machine name: %s", optarg);
570 return -EINVAL;
571 }
572
573 r = free_and_strdup(&arg_machine, optarg);
574 if (r < 0)
575 return log_oom();
576
577 break;
578 }
579
580 case 'Z':
581 arg_selinux_context = optarg;
582 break;
583
584 case 'L':
585 arg_selinux_apifs_context = optarg;
586 break;
587
588 case ARG_READ_ONLY:
589 arg_read_only = true;
590 break;
591
592 case ARG_CAPABILITY:
593 case ARG_DROP_CAPABILITY: {
594 const char *state, *word;
595 size_t length;
596
597 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
598 _cleanup_free_ char *t;
599
600 t = strndup(word, length);
601 if (!t)
602 return log_oom();
603
604 if (streq(t, "all")) {
605 if (c == ARG_CAPABILITY)
606 plus = (uint64_t) -1;
607 else
608 minus = (uint64_t) -1;
609 } else {
610 int cap;
611
612 cap = capability_from_name(t);
613 if (cap < 0) {
614 log_error("Failed to parse capability %s.", t);
615 return -EINVAL;
616 }
617
618 if (c == ARG_CAPABILITY)
619 plus |= 1ULL << (uint64_t) cap;
620 else
621 minus |= 1ULL << (uint64_t) cap;
622 }
623 }
624
625 break;
626 }
627
628 case 'j':
629 arg_link_journal = LINK_GUEST;
630 arg_link_journal_try = true;
631 break;
632
633 case ARG_LINK_JOURNAL:
634 if (streq(optarg, "auto")) {
635 arg_link_journal = LINK_AUTO;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "no")) {
638 arg_link_journal = LINK_NO;
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "guest")) {
641 arg_link_journal = LINK_GUEST;
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "host")) {
644 arg_link_journal = LINK_HOST;
645 arg_link_journal_try = false;
646 } else if (streq(optarg, "try-guest")) {
647 arg_link_journal = LINK_GUEST;
648 arg_link_journal_try = true;
649 } else if (streq(optarg, "try-host")) {
650 arg_link_journal = LINK_HOST;
651 arg_link_journal_try = true;
652 } else {
653 log_error("Failed to parse link journal mode %s", optarg);
654 return -EINVAL;
655 }
656
657 break;
658
659 case ARG_BIND:
660 case ARG_BIND_RO: {
661 _cleanup_free_ char *source = NULL, *destination = NULL;
662 CustomMount *m;
663 char *e;
664
665 e = strchr(optarg, ':');
666 if (e) {
667 source = strndup(optarg, e - optarg);
668 destination = strdup(e + 1);
669 } else {
670 source = strdup(optarg);
671 destination = strdup(optarg);
672 }
673
674 if (!source || !destination)
675 return log_oom();
676
677 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
678 log_error("Invalid bind mount specification: %s", optarg);
679 return -EINVAL;
680 }
681
682 m = custom_mount_add(CUSTOM_MOUNT_BIND);
683 if (!m)
684 return log_oom();
685
686 m->source = source;
687 m->destination = destination;
688 m->read_only = c == ARG_BIND_RO;
689
690 source = destination = NULL;
691
692 break;
693 }
694
695 case ARG_TMPFS: {
696 _cleanup_free_ char *path = NULL, *opts = NULL;
697 CustomMount *m;
698 char *e;
699
700 e = strchr(optarg, ':');
701 if (e) {
702 path = strndup(optarg, e - optarg);
703 opts = strdup(e + 1);
704 } else {
705 path = strdup(optarg);
706 opts = strdup("mode=0755");
707 }
708
709 if (!path || !opts)
710 return log_oom();
711
712 if (!path_is_absolute(path)) {
713 log_error("Invalid tmpfs specification: %s", optarg);
714 return -EINVAL;
715 }
716
717 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
718 if (!m)
719 return log_oom();
720
721 m->destination = path;
722 m->options = opts;
723
724 path = opts = NULL;
725
726 break;
727 }
728
729 case ARG_OVERLAY:
730 case ARG_OVERLAY_RO: {
731 _cleanup_free_ char *upper = NULL, *destination = NULL;
732 _cleanup_strv_free_ char **lower = NULL;
733 CustomMount *m;
734 unsigned n = 0;
735 char **i;
736
737 lower = strv_split(optarg, ":");
738 if (!lower)
739 return log_oom();
740
741 STRV_FOREACH(i, lower) {
742 if (!path_is_absolute(*i)) {
743 log_error("Overlay path %s is not absolute.", *i);
744 return -EINVAL;
745 }
746
747 n++;
748 }
749
750 if (n < 2) {
751 log_error("--overlay= needs at least two colon-separated directories specified.");
752 return -EINVAL;
753 }
754
755 if (n == 2) {
756 /* If two parameters are specified,
757 * the first one is the lower, the
758 * second one the upper directory. And
759 * we'll also define the destination
760 * mount point the same as the upper. */
761 upper = lower[1];
762 lower[1] = NULL;
763
764 destination = strdup(upper);
765 if (!destination)
766 return log_oom();
767
768 } else {
769 upper = lower[n - 2];
770 destination = lower[n - 1];
771 lower[n - 2] = NULL;
772 }
773
774 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
775 if (!m)
776 return log_oom();
777
778 m->destination = destination;
779 m->source = upper;
780 m->lower = lower;
781 m->read_only = c == ARG_OVERLAY_RO;
782
783 upper = destination = NULL;
784 lower = NULL;
785
786 break;
787 }
788
789 case ARG_SETENV: {
790 char **n;
791
792 if (!env_assignment_is_valid(optarg)) {
793 log_error("Environment variable assignment '%s' is not valid.", optarg);
794 return -EINVAL;
795 }
796
797 n = strv_env_set(arg_setenv, optarg);
798 if (!n)
799 return log_oom();
800
801 strv_free(arg_setenv);
802 arg_setenv = n;
803 break;
804 }
805
806 case 'q':
807 arg_quiet = true;
808 break;
809
810 case ARG_SHARE_SYSTEM:
811 arg_share_system = true;
812 break;
813
814 case ARG_REGISTER:
815 r = parse_boolean(optarg);
816 if (r < 0) {
817 log_error("Failed to parse --register= argument: %s", optarg);
818 return r;
819 }
820
821 arg_register = r;
822 break;
823
824 case ARG_KEEP_UNIT:
825 arg_keep_unit = true;
826 break;
827
828 case ARG_PERSONALITY:
829
830 arg_personality = personality_from_string(optarg);
831 if (arg_personality == PERSONALITY_INVALID) {
832 log_error("Unknown or unsupported personality '%s'.", optarg);
833 return -EINVAL;
834 }
835
836 break;
837
838 case ARG_VOLATILE:
839
840 if (!optarg)
841 arg_volatile = VOLATILE_YES;
842 else {
843 r = parse_boolean(optarg);
844 if (r < 0) {
845 if (streq(optarg, "state"))
846 arg_volatile = VOLATILE_STATE;
847 else {
848 log_error("Failed to parse --volatile= argument: %s", optarg);
849 return r;
850 }
851 } else
852 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
853 }
854
855 break;
856
857 case 'p': {
858 const char *split, *e;
859 uint16_t container_port, host_port;
860 int protocol;
861 ExposePort *p;
862
863 if ((e = startswith(optarg, "tcp:")))
864 protocol = IPPROTO_TCP;
865 else if ((e = startswith(optarg, "udp:")))
866 protocol = IPPROTO_UDP;
867 else {
868 e = optarg;
869 protocol = IPPROTO_TCP;
870 }
871
872 split = strchr(e, ':');
873 if (split) {
874 char v[split - e + 1];
875
876 memcpy(v, e, split - e);
877 v[split - e] = 0;
878
879 r = safe_atou16(v, &host_port);
880 if (r < 0 || host_port <= 0) {
881 log_error("Failed to parse host port: %s", optarg);
882 return -EINVAL;
883 }
884
885 r = safe_atou16(split + 1, &container_port);
886 } else {
887 r = safe_atou16(e, &container_port);
888 host_port = container_port;
889 }
890
891 if (r < 0 || container_port <= 0) {
892 log_error("Failed to parse host port: %s", optarg);
893 return -EINVAL;
894 }
895
896 LIST_FOREACH(ports, p, arg_expose_ports) {
897 if (p->protocol == protocol && p->host_port == host_port) {
898 log_error("Duplicate port specification: %s", optarg);
899 return -EINVAL;
900 }
901 }
902
903 p = new(ExposePort, 1);
904 if (!p)
905 return log_oom();
906
907 p->protocol = protocol;
908 p->host_port = host_port;
909 p->container_port = container_port;
910
911 LIST_PREPEND(ports, arg_expose_ports, p);
912
913 break;
914 }
915
916 case ARG_PROPERTY:
917 if (strv_extend(&arg_property, optarg) < 0)
918 return log_oom();
919
920 break;
921
922 case ARG_PRIVATE_USERS:
923 if (optarg) {
924 _cleanup_free_ char *buffer = NULL;
925 const char *range, *shift;
926
927 range = strchr(optarg, ':');
928 if (range) {
929 buffer = strndup(optarg, range - optarg);
930 if (!buffer)
931 return log_oom();
932 shift = buffer;
933
934 range++;
935 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
936 log_error("Failed to parse UID range: %s", range);
937 return -EINVAL;
938 }
939 } else
940 shift = optarg;
941
942 if (parse_uid(shift, &arg_uid_shift) < 0) {
943 log_error("Failed to parse UID: %s", optarg);
944 return -EINVAL;
945 }
946 }
947
948 arg_userns = true;
949 break;
950
951 case ARG_KILL_SIGNAL:
952 arg_kill_signal = signal_from_string_try_harder(optarg);
953 if (arg_kill_signal < 0) {
954 log_error("Cannot parse signal: %s", optarg);
955 return -EINVAL;
956 }
957
958 break;
959
960 case '?':
961 return -EINVAL;
962
963 default:
964 assert_not_reached("Unhandled option");
965 }
966
967 if (arg_share_system)
968 arg_register = false;
969
970 if (arg_boot && arg_share_system) {
971 log_error("--boot and --share-system may not be combined.");
972 return -EINVAL;
973 }
974
975 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
976 log_error("--keep-unit may not be used when invoked from a user session.");
977 return -EINVAL;
978 }
979
980 if (arg_directory && arg_image) {
981 log_error("--directory= and --image= may not be combined.");
982 return -EINVAL;
983 }
984
985 if (arg_template && arg_image) {
986 log_error("--template= and --image= may not be combined.");
987 return -EINVAL;
988 }
989
990 if (arg_template && !(arg_directory || arg_machine)) {
991 log_error("--template= needs --directory= or --machine=.");
992 return -EINVAL;
993 }
994
995 if (arg_ephemeral && arg_template) {
996 log_error("--ephemeral and --template= may not be combined.");
997 return -EINVAL;
998 }
999
1000 if (arg_ephemeral && arg_image) {
1001 log_error("--ephemeral and --image= may not be combined.");
1002 return -EINVAL;
1003 }
1004
1005 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1006 log_error("--ephemeral and --link-journal= may not be combined.");
1007 return -EINVAL;
1008 }
1009
1010 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1011 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1012 return -EINVAL;
1013 }
1014
1015 if (arg_expose_ports && !arg_private_network) {
1016 log_error("Cannot use --port= without private networking.");
1017 return -EINVAL;
1018 }
1019
1020 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1021 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1022
1023 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1024
1025 if (arg_boot && arg_kill_signal <= 0)
1026 arg_kill_signal = SIGRTMIN+3;
1027
1028 return 1;
1029 }
1030
1031 static int tmpfs_patch_options(const char *options, char **ret) {
1032 char *buf = NULL;
1033
1034 if (arg_userns && arg_uid_shift != 0) {
1035 assert(arg_uid_shift != UID_INVALID);
1036
1037 if (options)
1038 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1039 else
1040 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1041 if (!buf)
1042 return -ENOMEM;
1043
1044 options = buf;
1045 }
1046
1047 #ifdef HAVE_SELINUX
1048 if (arg_selinux_apifs_context) {
1049 char *t;
1050
1051 if (options)
1052 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1053 else
1054 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1055 if (!t) {
1056 free(buf);
1057 return -ENOMEM;
1058 }
1059
1060 free(buf);
1061 buf = t;
1062 }
1063 #endif
1064
1065 *ret = buf;
1066 return !!buf;
1067 }
1068
1069 static int mount_all(const char *dest, bool userns) {
1070
1071 typedef struct MountPoint {
1072 const char *what;
1073 const char *where;
1074 const char *type;
1075 const char *options;
1076 unsigned long flags;
1077 bool fatal;
1078 bool userns;
1079 } MountPoint;
1080
1081 static const MountPoint mount_table[] = {
1082 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1083 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1084 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1085 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1086 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1087 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1088 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1089 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1090 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1091 #ifdef HAVE_SELINUX
1092 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1093 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1094 #endif
1095 };
1096
1097 unsigned k;
1098 int r;
1099
1100 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1101 _cleanup_free_ char *where = NULL, *options = NULL;
1102 const char *o;
1103
1104 if (userns != mount_table[k].userns)
1105 continue;
1106
1107 where = prefix_root(dest, mount_table[k].where);
1108 if (!where)
1109 return log_oom();
1110
1111 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1112 if (r < 0 && r != -ENOENT)
1113 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1114
1115 /* Skip this entry if it is not a remount. */
1116 if (mount_table[k].what && r > 0)
1117 continue;
1118
1119 r = mkdir_p(where, 0755);
1120 if (r < 0) {
1121 if (mount_table[k].fatal)
1122 return log_error_errno(r, "Failed to create directory %s: %m", where);
1123
1124 log_warning_errno(r, "Failed to create directory %s: %m", where);
1125 continue;
1126 }
1127
1128 o = mount_table[k].options;
1129 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1130 r = tmpfs_patch_options(o, &options);
1131 if (r < 0)
1132 return log_oom();
1133 if (r > 0)
1134 o = options;
1135 }
1136
1137 if (mount(mount_table[k].what,
1138 where,
1139 mount_table[k].type,
1140 mount_table[k].flags,
1141 o) < 0) {
1142
1143 if (mount_table[k].fatal)
1144 return log_error_errno(errno, "mount(%s) failed: %m", where);
1145
1146 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1147 }
1148 }
1149
1150 return 0;
1151 }
1152
1153 static int mount_bind(const char *dest, CustomMount *m) {
1154 struct stat source_st, dest_st;
1155 const char *where;
1156 int r;
1157
1158 assert(m);
1159
1160 if (stat(m->source, &source_st) < 0)
1161 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1162
1163 where = prefix_roota(dest, m->destination);
1164
1165 if (stat(where, &dest_st) >= 0) {
1166 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1167 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1168 return -EINVAL;
1169 }
1170
1171 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1172 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1173 return -EINVAL;
1174 }
1175
1176 } else if (errno == ENOENT) {
1177 r = mkdir_parents_label(where, 0755);
1178 if (r < 0)
1179 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1180 } else {
1181 log_error_errno(errno, "Failed to stat %s: %m", where);
1182 return -errno;
1183 }
1184
1185 /* Create the mount point. Any non-directory file can be
1186 * mounted on any non-directory file (regular, fifo, socket,
1187 * char, block).
1188 */
1189 if (S_ISDIR(source_st.st_mode))
1190 r = mkdir_label(where, 0755);
1191 else
1192 r = touch(where);
1193 if (r < 0 && r != -EEXIST)
1194 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1195
1196 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1197 return log_error_errno(errno, "mount(%s) failed: %m", where);
1198
1199 if (m->read_only) {
1200 r = bind_remount_recursive(where, true);
1201 if (r < 0)
1202 return log_error_errno(r, "Read-only bind mount failed: %m");
1203 }
1204
1205 return 0;
1206 }
1207
1208 static int mount_tmpfs(const char *dest, CustomMount *m) {
1209 const char *where, *options;
1210 _cleanup_free_ char *buf = NULL;
1211 int r;
1212
1213 assert(dest);
1214 assert(m);
1215
1216 where = prefix_roota(dest, m->destination);
1217
1218 r = mkdir_p_label(where, 0755);
1219 if (r < 0 && r != -EEXIST)
1220 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1221
1222 r = tmpfs_patch_options(m->options, &buf);
1223 if (r < 0)
1224 return log_oom();
1225 options = r > 0 ? buf : m->options;
1226
1227 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1228 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1229
1230 return 0;
1231 }
1232
1233 static int mount_overlay(const char *dest, CustomMount *m) {
1234 _cleanup_free_ char *lower = NULL;
1235 const char *where, *options;
1236 int r;
1237
1238 assert(dest);
1239 assert(m);
1240
1241 where = prefix_roota(dest, m->destination);
1242
1243 r = mkdir_label(where, 0755);
1244 if (r < 0 && r != -EEXIST)
1245 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1246
1247 (void) mkdir_p_label(m->source, 0755);
1248
1249 strv_reverse(m->lower);
1250 lower = strv_join(m->lower, ":");
1251 strv_reverse(m->lower);
1252 if (!lower)
1253 return log_oom();
1254
1255 if (m->read_only)
1256 options = strjoina("lowerdir=", m->source, ":", lower);
1257 else {
1258 assert(m->work_dir);
1259 (void) mkdir_label(m->work_dir, 0700);
1260
1261 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1262 }
1263
1264 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1265 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1266
1267 return 0;
1268 }
1269
1270 static int mount_custom(const char *dest) {
1271 unsigned i;
1272 int r;
1273
1274 assert(dest);
1275
1276 for (i = 0; i < arg_n_custom_mounts; i++) {
1277 CustomMount *m = &arg_custom_mounts[i];
1278
1279 switch (m->type) {
1280
1281 case CUSTOM_MOUNT_BIND:
1282 r = mount_bind(dest, m);
1283 break;
1284
1285 case CUSTOM_MOUNT_TMPFS:
1286 r = mount_tmpfs(dest, m);
1287 break;
1288
1289 case CUSTOM_MOUNT_OVERLAY:
1290 r = mount_overlay(dest, m);
1291 break;
1292
1293 default:
1294 assert_not_reached("Unknown custom mount type");
1295 }
1296
1297 if (r < 0)
1298 return r;
1299 }
1300
1301 return 0;
1302 }
1303
1304 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1305 char *to;
1306 int r;
1307
1308 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1309
1310 r = path_is_mount_point(to, 0);
1311 if (r < 0 && r != -ENOENT)
1312 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1313 if (r > 0)
1314 return 0;
1315
1316 mkdir_p(to, 0755);
1317
1318 /* The superblock mount options of the mount point need to be
1319 * identical to the hosts', and hence writable... */
1320 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1321 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1322
1323 /* ... hence let's only make the bind mount read-only, not the
1324 * superblock. */
1325 if (read_only) {
1326 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1327 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1328 }
1329 return 1;
1330 }
1331
1332 static int mount_cgroup(const char *dest) {
1333 _cleanup_set_free_free_ Set *controllers = NULL;
1334 const char *cgroup_root;
1335 int r;
1336
1337 controllers = set_new(&string_hash_ops);
1338 if (!controllers)
1339 return log_oom();
1340
1341 r = cg_kernel_controllers(controllers);
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1344
1345 for (;;) {
1346 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1347
1348 controller = set_steal_first(controllers);
1349 if (!controller)
1350 break;
1351
1352 origin = prefix_root("/sys/fs/cgroup/", controller);
1353 if (!origin)
1354 return log_oom();
1355
1356 r = readlink_malloc(origin, &combined);
1357 if (r == -EINVAL) {
1358 /* Not a symbolic link, but directly a single cgroup hierarchy */
1359
1360 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1361 if (r < 0)
1362 return r;
1363
1364 } else if (r < 0)
1365 return log_error_errno(r, "Failed to read link %s: %m", origin);
1366 else {
1367 _cleanup_free_ char *target = NULL;
1368
1369 target = prefix_root(dest, origin);
1370 if (!target)
1371 return log_oom();
1372
1373 /* A symbolic link, a combination of controllers in one hierarchy */
1374
1375 if (!filename_is_valid(combined)) {
1376 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1377 continue;
1378 }
1379
1380 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1381 if (r < 0)
1382 return r;
1383
1384 r = symlink_idempotent(combined, target);
1385 if (r == -EINVAL) {
1386 log_error("Invalid existing symlink for combined hierarchy");
1387 return r;
1388 }
1389 if (r < 0)
1390 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1391 }
1392 }
1393
1394 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1395 if (r < 0)
1396 return r;
1397
1398 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1399 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1400 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1401
1402 return 0;
1403 }
1404
1405 static int mount_systemd_cgroup_writable(const char *dest) {
1406 _cleanup_free_ char *own_cgroup_path = NULL;
1407 const char *systemd_root, *systemd_own;
1408 int r;
1409
1410 assert(dest);
1411
1412 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1413 if (r < 0)
1414 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1415
1416 /* Make our own cgroup a (writable) bind mount */
1417 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1418 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1419 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1420
1421 /* And then remount the systemd cgroup root read-only */
1422 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1423 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1424 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1425
1426 return 0;
1427 }
1428
1429 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1430 assert(p);
1431
1432 if (!arg_userns)
1433 return 0;
1434
1435 if (uid == UID_INVALID && gid == GID_INVALID)
1436 return 0;
1437
1438 if (uid != UID_INVALID) {
1439 uid += arg_uid_shift;
1440
1441 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1442 return -EOVERFLOW;
1443 }
1444
1445 if (gid != GID_INVALID) {
1446 gid += (gid_t) arg_uid_shift;
1447
1448 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1449 return -EOVERFLOW;
1450 }
1451
1452 if (lchown(p, uid, gid) < 0)
1453 return -errno;
1454
1455 return 0;
1456 }
1457
/* Creates the directory "path" below "root" with the given mode and, if it
 * was newly created, hands it to userns_lchown() with uid/gid.
 *
 * An already existing directory is left untouched and counts as success.
 * Returns 0 on success, negative errno-style value otherwise. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1470
1471 static int setup_timezone(const char *dest) {
1472 _cleanup_free_ char *p = NULL, *q = NULL;
1473 const char *where, *check, *what;
1474 char *z, *y;
1475 int r;
1476
1477 assert(dest);
1478
1479 /* Fix the timezone, if possible */
1480 r = readlink_malloc("/etc/localtime", &p);
1481 if (r < 0) {
1482 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1483 return 0;
1484 }
1485
1486 z = path_startswith(p, "../usr/share/zoneinfo/");
1487 if (!z)
1488 z = path_startswith(p, "/usr/share/zoneinfo/");
1489 if (!z) {
1490 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1491 return 0;
1492 }
1493
1494 where = prefix_roota(dest, "/etc/localtime");
1495 r = readlink_malloc(where, &q);
1496 if (r >= 0) {
1497 y = path_startswith(q, "../usr/share/zoneinfo/");
1498 if (!y)
1499 y = path_startswith(q, "/usr/share/zoneinfo/");
1500
1501 /* Already pointing to the right place? Then do nothing .. */
1502 if (y && streq(y, z))
1503 return 0;
1504 }
1505
1506 check = strjoina("/usr/share/zoneinfo/", z);
1507 check = prefix_root(dest, check);
1508 if (laccess(check, F_OK) < 0) {
1509 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1510 return 0;
1511 }
1512
1513 r = unlink(where);
1514 if (r < 0 && errno != ENOENT) {
1515 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1516 return 0;
1517 }
1518
1519 what = strjoina("../usr/share/zoneinfo/", z);
1520 if (symlink(what, where) < 0) {
1521 log_error_errno(errno, "Failed to correct timezone of container: %m");
1522 return 0;
1523 }
1524
1525 r = userns_lchown(where, 0, 0);
1526 if (r < 0)
1527 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1528
1529 return 0;
1530 }
1531
1532 static int setup_resolv_conf(const char *dest) {
1533 const char *where = NULL;
1534 int r;
1535
1536 assert(dest);
1537
1538 if (arg_private_network)
1539 return 0;
1540
1541 /* Fix resolv.conf, if possible */
1542 where = prefix_roota(dest, "/etc/resolv.conf");
1543
1544 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1545 if (r < 0) {
1546 /* If the file already exists as symlink, let's
1547 * suppress the warning, under the assumption that
1548 * resolved or something similar runs inside and the
1549 * symlink points there.
1550 *
1551 * If the disk image is read-only, there's also no
1552 * point in complaining.
1553 */
1554 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1555 "Failed to copy /etc/resolv.conf to %s: %m", where);
1556 return 0;
1557 }
1558
1559 r = userns_lchown(where, 0, 0);
1560 if (r < 0)
1561 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1562
1563 return 0;
1564 }
1565
1566 static int setup_volatile_state(const char *directory) {
1567 _cleanup_free_ char *buf = NULL;
1568 const char *p, *options;
1569 int r;
1570
1571 assert(directory);
1572
1573 if (arg_volatile != VOLATILE_STATE)
1574 return 0;
1575
1576 /* --volatile=state means we simply overmount /var
1577 with a tmpfs, and the rest read-only. */
1578
1579 r = bind_remount_recursive(directory, true);
1580 if (r < 0)
1581 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1582
1583 p = prefix_roota(directory, "/var");
1584 r = mkdir(p, 0755);
1585 if (r < 0 && errno != EEXIST)
1586 return log_error_errno(errno, "Failed to create %s: %m", directory);
1587
1588 options = "mode=755";
1589 r = tmpfs_patch_options(options, &buf);
1590 if (r < 0)
1591 return log_oom();
1592 if (r > 0)
1593 options = buf;
1594
1595 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1596 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1597
1598 return 0;
1599 }
1600
1601 static int setup_volatile(const char *directory) {
1602 bool tmpfs_mounted = false, bind_mounted = false;
1603 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1604 _cleanup_free_ char *buf = NULL;
1605 const char *f, *t, *options;
1606 int r;
1607
1608 assert(directory);
1609
1610 if (arg_volatile != VOLATILE_YES)
1611 return 0;
1612
1613 /* --volatile=yes means we mount a tmpfs to the root dir, and
1614 the original /usr to use inside it, and that read-only. */
1615
1616 if (!mkdtemp(template))
1617 return log_error_errno(errno, "Failed to create temporary directory: %m");
1618
1619 options = "mode=755";
1620 r = tmpfs_patch_options(options, &buf);
1621 if (r < 0)
1622 return log_oom();
1623 if (r > 0)
1624 options = buf;
1625
1626 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1627 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1628 goto fail;
1629 }
1630
1631 tmpfs_mounted = true;
1632
1633 f = prefix_roota(directory, "/usr");
1634 t = prefix_roota(template, "/usr");
1635
1636 r = mkdir(t, 0755);
1637 if (r < 0 && errno != EEXIST) {
1638 r = log_error_errno(errno, "Failed to create %s: %m", t);
1639 goto fail;
1640 }
1641
1642 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1643 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1644 goto fail;
1645 }
1646
1647 bind_mounted = true;
1648
1649 r = bind_remount_recursive(t, true);
1650 if (r < 0) {
1651 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1652 goto fail;
1653 }
1654
1655 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1656 r = log_error_errno(errno, "Failed to move root mount: %m");
1657 goto fail;
1658 }
1659
1660 (void) rmdir(template);
1661
1662 return 0;
1663
1664 fail:
1665 if (bind_mounted)
1666 (void) umount(t);
1667
1668 if (tmpfs_mounted)
1669 (void) umount(template);
1670 (void) rmdir(template);
1671 return r;
1672 }
1673
1674 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1675 assert(s);
1676
1677 snprintf(s, 37,
1678 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1679 SD_ID128_FORMAT_VAL(id));
1680
1681 return s;
1682 }
1683
1684 static int setup_boot_id(const char *dest) {
1685 const char *from, *to;
1686 sd_id128_t rnd = {};
1687 char as_uuid[37];
1688 int r;
1689
1690 if (arg_share_system)
1691 return 0;
1692
1693 /* Generate a new randomized boot ID, so that each boot-up of
1694 * the container gets a new one */
1695
1696 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1697 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1698
1699 r = sd_id128_randomize(&rnd);
1700 if (r < 0)
1701 return log_error_errno(r, "Failed to generate random boot id: %m");
1702
1703 id128_format_as_uuid(rnd, as_uuid);
1704
1705 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1706 if (r < 0)
1707 return log_error_errno(r, "Failed to write boot id: %m");
1708
1709 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1710 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1711 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1712 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1713
1714 unlink(from);
1715 return r;
1716 }
1717
1718 static int copy_devnodes(const char *dest) {
1719
1720 static const char devnodes[] =
1721 "null\0"
1722 "zero\0"
1723 "full\0"
1724 "random\0"
1725 "urandom\0"
1726 "tty\0"
1727 "net/tun\0";
1728
1729 const char *d;
1730 int r = 0;
1731 _cleanup_umask_ mode_t u;
1732
1733 assert(dest);
1734
1735 u = umask(0000);
1736
1737 /* Create /dev/net, so that we can create /dev/net/tun in it */
1738 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1739 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1740
1741 NULSTR_FOREACH(d, devnodes) {
1742 _cleanup_free_ char *from = NULL, *to = NULL;
1743 struct stat st;
1744
1745 from = strappend("/dev/", d);
1746 to = prefix_root(dest, from);
1747
1748 if (stat(from, &st) < 0) {
1749
1750 if (errno != ENOENT)
1751 return log_error_errno(errno, "Failed to stat %s: %m", from);
1752
1753 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1754
1755 log_error("%s is not a char or block device, cannot copy.", from);
1756 return -EIO;
1757
1758 } else {
1759 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1760 if (errno != EPERM)
1761 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1762
1763 /* Some systems abusively restrict mknod but
1764 * allow bind mounts. */
1765 r = touch(to);
1766 if (r < 0)
1767 return log_error_errno(r, "touch (%s) failed: %m", to);
1768 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1769 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1770 }
1771
1772 r = userns_lchown(to, 0, 0);
1773 if (r < 0)
1774 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1775 }
1776 }
1777
1778 return r;
1779 }
1780
1781 static int setup_pts(const char *dest) {
1782 _cleanup_free_ char *options = NULL;
1783 const char *p;
1784
1785 #ifdef HAVE_SELINUX
1786 if (arg_selinux_apifs_context)
1787 (void) asprintf(&options,
1788 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT ",context=\"%s\"",
1789 arg_uid_shift,
1790 arg_uid_shift + TTY_GID,
1791 arg_selinux_apifs_context);
1792 else
1793 #endif
1794 (void) asprintf(&options,
1795 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT ",gid=" GID_FMT,
1796 arg_uid_shift,
1797 arg_uid_shift + TTY_GID);
1798
1799 if (!options)
1800 return log_oom();
1801
1802 /* Mount /dev/pts itself */
1803 p = prefix_roota(dest, "/dev/pts");
1804 if (mkdir(p, 0755) < 0)
1805 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1806 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1807 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1808 if (userns_lchown(p, 0, 0) < 0)
1809 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1810
1811 /* Create /dev/ptmx symlink */
1812 p = prefix_roota(dest, "/dev/ptmx");
1813 if (symlink("pts/ptmx", p) < 0)
1814 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1815 if (userns_lchown(p, 0, 0) < 0)
1816 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1817
1818 /* And fix /dev/pts/ptmx ownership */
1819 p = prefix_roota(dest, "/dev/pts/ptmx");
1820 if (userns_lchown(p, 0, 0) < 0)
1821 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1822
1823 return 0;
1824 }
1825
1826 static int setup_dev_console(const char *dest, const char *console) {
1827 _cleanup_umask_ mode_t u;
1828 const char *to;
1829 int r;
1830
1831 assert(dest);
1832 assert(console);
1833
1834 u = umask(0000);
1835
1836 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1837 if (r < 0)
1838 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1839
1840 /* We need to bind mount the right tty to /dev/console since
1841 * ptys can only exist on pts file systems. To have something
1842 * to bind mount things on we create a empty regular file. */
1843
1844 to = prefix_roota(dest, "/dev/console");
1845 r = touch(to);
1846 if (r < 0)
1847 return log_error_errno(r, "touch() for /dev/console failed: %m");
1848
1849 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1850 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1851
1852 return 0;
1853 }
1854
1855 static int setup_kmsg(const char *dest, int kmsg_socket) {
1856 const char *from, *to;
1857 _cleanup_umask_ mode_t u;
1858 int fd, k;
1859 union {
1860 struct cmsghdr cmsghdr;
1861 uint8_t buf[CMSG_SPACE(sizeof(int))];
1862 } control = {};
1863 struct msghdr mh = {
1864 .msg_control = &control,
1865 .msg_controllen = sizeof(control),
1866 };
1867 struct cmsghdr *cmsg;
1868
1869 assert(kmsg_socket >= 0);
1870
1871 u = umask(0000);
1872
1873 /* We create the kmsg FIFO as /run/kmsg, but immediately
1874 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1875 * on the reading side behave very similar to /proc/kmsg,
1876 * their writing side behaves differently from /dev/kmsg in
1877 * that writing blocks when nothing is reading. In order to
1878 * avoid any problems with containers deadlocking due to this
1879 * we simply make /dev/kmsg unavailable to the container. */
1880 from = prefix_roota(dest, "/run/kmsg");
1881 to = prefix_roota(dest, "/proc/kmsg");
1882
1883 if (mkfifo(from, 0600) < 0)
1884 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1885 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1886 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1887
1888 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1889 if (fd < 0)
1890 return log_error_errno(errno, "Failed to open fifo: %m");
1891
1892 cmsg = CMSG_FIRSTHDR(&mh);
1893 cmsg->cmsg_level = SOL_SOCKET;
1894 cmsg->cmsg_type = SCM_RIGHTS;
1895 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1896 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1897
1898 mh.msg_controllen = cmsg->cmsg_len;
1899
1900 /* Store away the fd in the socket, so that it stays open as
1901 * long as we run the child */
1902 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1903 safe_close(fd);
1904
1905 if (k < 0)
1906 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1907
1908 /* And now make the FIFO unavailable as /run/kmsg... */
1909 (void) unlink(from);
1910
1911 return 0;
1912 }
1913
1914 static int send_rtnl(int send_fd) {
1915 union {
1916 struct cmsghdr cmsghdr;
1917 uint8_t buf[CMSG_SPACE(sizeof(int))];
1918 } control = {};
1919 struct msghdr mh = {
1920 .msg_control = &control,
1921 .msg_controllen = sizeof(control),
1922 };
1923 struct cmsghdr *cmsg;
1924 _cleanup_close_ int fd = -1;
1925 ssize_t k;
1926
1927 assert(send_fd >= 0);
1928
1929 if (!arg_expose_ports)
1930 return 0;
1931
1932 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1933 if (fd < 0)
1934 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1935
1936 cmsg = CMSG_FIRSTHDR(&mh);
1937 cmsg->cmsg_level = SOL_SOCKET;
1938 cmsg->cmsg_type = SCM_RIGHTS;
1939 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1940 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1941
1942 mh.msg_controllen = cmsg->cmsg_len;
1943
1944 /* Store away the fd in the socket, so that it stays open as
1945 * long as we run the child */
1946 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1947 if (k < 0)
1948 return log_error_errno(errno, "Failed to send netlink fd: %m");
1949
1950 return 0;
1951 }
1952
1953 static int flush_ports(union in_addr_union *exposed) {
1954 ExposePort *p;
1955 int r, af = AF_INET;
1956
1957 assert(exposed);
1958
1959 if (!arg_expose_ports)
1960 return 0;
1961
1962 if (in_addr_is_null(af, exposed))
1963 return 0;
1964
1965 log_debug("Lost IP address.");
1966
1967 LIST_FOREACH(ports, p, arg_expose_ports) {
1968 r = fw_add_local_dnat(false,
1969 af,
1970 p->protocol,
1971 NULL,
1972 NULL, 0,
1973 NULL, 0,
1974 p->host_port,
1975 exposed,
1976 p->container_port,
1977 NULL);
1978 if (r < 0)
1979 log_warning_errno(r, "Failed to modify firewall: %m");
1980 }
1981
1982 *exposed = IN_ADDR_NULL;
1983 return 0;
1984 }
1985
1986 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
1987 _cleanup_free_ struct local_address *addresses = NULL;
1988 _cleanup_free_ char *pretty = NULL;
1989 union in_addr_union new_exposed;
1990 ExposePort *p;
1991 bool add;
1992 int af = AF_INET, r;
1993
1994 assert(exposed);
1995
1996 /* Invoked each time an address is added or removed inside the
1997 * container */
1998
1999 if (!arg_expose_ports)
2000 return 0;
2001
2002 r = local_addresses(rtnl, 0, af, &addresses);
2003 if (r < 0)
2004 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2005
2006 add = r > 0 &&
2007 addresses[0].family == af &&
2008 addresses[0].scope < RT_SCOPE_LINK;
2009
2010 if (!add)
2011 return flush_ports(exposed);
2012
2013 new_exposed = addresses[0].address;
2014 if (in_addr_equal(af, exposed, &new_exposed))
2015 return 0;
2016
2017 in_addr_to_string(af, &new_exposed, &pretty);
2018 log_debug("New container IP is %s.", strna(pretty));
2019
2020 LIST_FOREACH(ports, p, arg_expose_ports) {
2021
2022 r = fw_add_local_dnat(true,
2023 af,
2024 p->protocol,
2025 NULL,
2026 NULL, 0,
2027 NULL, 0,
2028 p->host_port,
2029 &new_exposed,
2030 p->container_port,
2031 in_addr_is_null(af, exposed) ? NULL : exposed);
2032 if (r < 0)
2033 log_warning_errno(r, "Failed to modify firewall: %m");
2034 }
2035
2036 *exposed = new_exposed;
2037 return 0;
2038 }
2039
2040 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2041 union in_addr_union *exposed = userdata;
2042
2043 assert(rtnl);
2044 assert(m);
2045 assert(exposed);
2046
2047 expose_ports(rtnl, exposed);
2048 return 0;
2049 }
2050
2051 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2052 union {
2053 struct cmsghdr cmsghdr;
2054 uint8_t buf[CMSG_SPACE(sizeof(int))];
2055 } control = {};
2056 struct msghdr mh = {
2057 .msg_control = &control,
2058 .msg_controllen = sizeof(control),
2059 };
2060 struct cmsghdr *cmsg;
2061 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2062 int fd, r;
2063 ssize_t k;
2064
2065 assert(event);
2066 assert(recv_fd >= 0);
2067 assert(ret);
2068
2069 if (!arg_expose_ports)
2070 return 0;
2071
2072 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2073 if (k < 0)
2074 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2075
2076 cmsg = CMSG_FIRSTHDR(&mh);
2077 assert(cmsg->cmsg_level == SOL_SOCKET);
2078 assert(cmsg->cmsg_type == SCM_RIGHTS);
2079 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2080 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2081
2082 r = sd_netlink_open_fd(&rtnl, fd);
2083 if (r < 0) {
2084 safe_close(fd);
2085 return log_error_errno(r, "Failed to create rtnl object: %m");
2086 }
2087
2088 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2089 if (r < 0)
2090 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2091
2092 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2093 if (r < 0)
2094 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2095
2096 r = sd_netlink_attach_event(rtnl, event, 0);
2097 if (r < 0)
2098 return log_error_errno(r, "Failed to add to even loop: %m");
2099
2100 *ret = rtnl;
2101 rtnl = NULL;
2102
2103 return 0;
2104 }
2105
2106 static int setup_hostname(void) {
2107
2108 if (arg_share_system)
2109 return 0;
2110
2111 if (sethostname_idempotent(arg_machine) < 0)
2112 return -errno;
2113
2114 return 0;
2115 }
2116
2117 static int setup_journal(const char *directory) {
2118 sd_id128_t machine_id, this_id;
2119 _cleanup_free_ char *b = NULL, *d = NULL;
2120 const char *etc_machine_id, *p, *q;
2121 char *id;
2122 int r;
2123
2124 /* Don't link journals in ephemeral mode */
2125 if (arg_ephemeral)
2126 return 0;
2127
2128 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2129
2130 r = read_one_line_file(etc_machine_id, &b);
2131 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2132 return 0;
2133 else if (r < 0)
2134 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2135
2136 id = strstrip(b);
2137 if (isempty(id) && arg_link_journal == LINK_AUTO)
2138 return 0;
2139
2140 /* Verify validity */
2141 r = sd_id128_from_string(id, &machine_id);
2142 if (r < 0)
2143 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2144
2145 r = sd_id128_get_machine(&this_id);
2146 if (r < 0)
2147 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2148
2149 if (sd_id128_equal(machine_id, this_id)) {
2150 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2151 "Host and machine ids are equal (%s): refusing to link journals", id);
2152 if (arg_link_journal == LINK_AUTO)
2153 return 0;
2154 return -EEXIST;
2155 }
2156
2157 if (arg_link_journal == LINK_NO)
2158 return 0;
2159
2160 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2161 if (r < 0)
2162 return log_error_errno(r, "Failed to create /var: %m");
2163
2164 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2165 if (r < 0)
2166 return log_error_errno(r, "Failed to create /var/log: %m");
2167
2168 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2169 if (r < 0)
2170 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2171
2172 p = strjoina("/var/log/journal/", id);
2173 q = prefix_roota(directory, p);
2174
2175 if (path_is_mount_point(p, 0) > 0) {
2176 if (arg_link_journal != LINK_AUTO) {
2177 log_error("%s: already a mount point, refusing to use for journal", p);
2178 return -EEXIST;
2179 }
2180
2181 return 0;
2182 }
2183
2184 if (path_is_mount_point(q, 0) > 0) {
2185 if (arg_link_journal != LINK_AUTO) {
2186 log_error("%s: already a mount point, refusing to use for journal", q);
2187 return -EEXIST;
2188 }
2189
2190 return 0;
2191 }
2192
2193 r = readlink_and_make_absolute(p, &d);
2194 if (r >= 0) {
2195 if ((arg_link_journal == LINK_GUEST ||
2196 arg_link_journal == LINK_AUTO) &&
2197 path_equal(d, q)) {
2198
2199 r = userns_mkdir(directory, p, 0755, 0, 0);
2200 if (r < 0)
2201 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2202 return 0;
2203 }
2204
2205 if (unlink(p) < 0)
2206 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2207 } else if (r == -EINVAL) {
2208
2209 if (arg_link_journal == LINK_GUEST &&
2210 rmdir(p) < 0) {
2211
2212 if (errno == ENOTDIR) {
2213 log_error("%s already exists and is neither a symlink nor a directory", p);
2214 return r;
2215 } else {
2216 log_error_errno(errno, "Failed to remove %s: %m", p);
2217 return -errno;
2218 }
2219 }
2220 } else if (r != -ENOENT) {
2221 log_error_errno(errno, "readlink(%s) failed: %m", p);
2222 return r;
2223 }
2224
2225 if (arg_link_journal == LINK_GUEST) {
2226
2227 if (symlink(q, p) < 0) {
2228 if (arg_link_journal_try) {
2229 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2230 return 0;
2231 } else {
2232 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2233 return -errno;
2234 }
2235 }
2236
2237 r = userns_mkdir(directory, p, 0755, 0, 0);
2238 if (r < 0)
2239 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2240 return 0;
2241 }
2242
2243 if (arg_link_journal == LINK_HOST) {
2244 /* don't create parents here -- if the host doesn't have
2245 * permanent journal set up, don't force it here */
2246 r = mkdir(p, 0755);
2247 if (r < 0) {
2248 if (arg_link_journal_try) {
2249 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2250 return 0;
2251 } else {
2252 log_error_errno(errno, "Failed to create %s: %m", p);
2253 return r;
2254 }
2255 }
2256
2257 } else if (access(p, F_OK) < 0)
2258 return 0;
2259
2260 if (dir_is_empty(q) == 0)
2261 log_warning("%s is not empty, proceeding anyway.", q);
2262
2263 r = userns_mkdir(directory, p, 0755, 0, 0);
2264 if (r < 0) {
2265 log_error_errno(errno, "Failed to create %s: %m", q);
2266 return r;
2267 }
2268
2269 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2270 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2271
2272 return 0;
2273 }
2274
2275 static int drop_capabilities(void) {
2276 return capability_bounding_set_drop(~arg_retain, false);
2277 }
2278
2279 static int register_machine(pid_t pid, int local_ifindex) {
2280 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2281 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2282 int r;
2283
2284 if (!arg_register)
2285 return 0;
2286
2287 r = sd_bus_default_system(&bus);
2288 if (r < 0)
2289 return log_error_errno(r, "Failed to open system bus: %m");
2290
2291 if (arg_keep_unit) {
2292 r = sd_bus_call_method(
2293 bus,
2294 "org.freedesktop.machine1",
2295 "/org/freedesktop/machine1",
2296 "org.freedesktop.machine1.Manager",
2297 "RegisterMachineWithNetwork",
2298 &error,
2299 NULL,
2300 "sayssusai",
2301 arg_machine,
2302 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2303 "nspawn",
2304 "container",
2305 (uint32_t) pid,
2306 strempty(arg_directory),
2307 local_ifindex > 0 ? 1 : 0, local_ifindex);
2308 } else {
2309 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2310 char **i;
2311 unsigned j;
2312
2313 r = sd_bus_message_new_method_call(
2314 bus,
2315 &m,
2316 "org.freedesktop.machine1",
2317 "/org/freedesktop/machine1",
2318 "org.freedesktop.machine1.Manager",
2319 "CreateMachineWithNetwork");
2320 if (r < 0)
2321 return bus_log_create_error(r);
2322
2323 r = sd_bus_message_append(
2324 m,
2325 "sayssusai",
2326 arg_machine,
2327 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2328 "nspawn",
2329 "container",
2330 (uint32_t) pid,
2331 strempty(arg_directory),
2332 local_ifindex > 0 ? 1 : 0, local_ifindex);
2333 if (r < 0)
2334 return bus_log_create_error(r);
2335
2336 r = sd_bus_message_open_container(m, 'a', "(sv)");
2337 if (r < 0)
2338 return bus_log_create_error(r);
2339
2340 if (!isempty(arg_slice)) {
2341 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2342 if (r < 0)
2343 return bus_log_create_error(r);
2344 }
2345
2346 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2347 if (r < 0)
2348 return bus_log_create_error(r);
2349
2350 /* If you make changes here, also make sure to update
2351 * systemd-nspawn@.service, to keep the device
2352 * policies in sync regardless if we are run with or
2353 * without the --keep-unit switch. */
2354 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2355 /* Allow the container to
2356 * access and create the API
2357 * device nodes, so that
2358 * PrivateDevices= in the
2359 * container can work
2360 * fine */
2361 "/dev/null", "rwm",
2362 "/dev/zero", "rwm",
2363 "/dev/full", "rwm",
2364 "/dev/random", "rwm",
2365 "/dev/urandom", "rwm",
2366 "/dev/tty", "rwm",
2367 "/dev/net/tun", "rwm",
2368 /* Allow the container
2369 * access to ptys. However,
2370 * do not permit the
2371 * container to ever create
2372 * these device nodes. */
2373 "/dev/pts/ptmx", "rw",
2374 "char-pts", "rw");
2375 if (r < 0)
2376 return bus_log_create_error(r);
2377
2378 for (j = 0; j < arg_n_custom_mounts; j++) {
2379 CustomMount *cm = &arg_custom_mounts[j];
2380
2381 if (cm->type != CUSTOM_MOUNT_BIND)
2382 continue;
2383
2384 r = is_device_node(cm->source);
2385 if (r < 0)
2386 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2387
2388 if (r) {
2389 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2390 cm->source, cm->read_only ? "r" : "rw");
2391 if (r < 0)
2392 return log_error_errno(r, "Failed to append message arguments: %m");
2393 }
2394 }
2395
2396 if (arg_kill_signal != 0) {
2397 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2398 if (r < 0)
2399 return bus_log_create_error(r);
2400
2401 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2402 if (r < 0)
2403 return bus_log_create_error(r);
2404 }
2405
2406 STRV_FOREACH(i, arg_property) {
2407 r = sd_bus_message_open_container(m, 'r', "sv");
2408 if (r < 0)
2409 return bus_log_create_error(r);
2410
2411 r = bus_append_unit_property_assignment(m, *i);
2412 if (r < 0)
2413 return r;
2414
2415 r = sd_bus_message_close_container(m);
2416 if (r < 0)
2417 return bus_log_create_error(r);
2418 }
2419
2420 r = sd_bus_message_close_container(m);
2421 if (r < 0)
2422 return bus_log_create_error(r);
2423
2424 r = sd_bus_call(bus, m, 0, &error, NULL);
2425 }
2426
2427 if (r < 0) {
2428 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2429 return r;
2430 }
2431
2432 return 0;
2433 }
2434
2435 static int terminate_machine(pid_t pid) {
2436 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2437 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2438 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2439 const char *path;
2440 int r;
2441
2442 if (!arg_register)
2443 return 0;
2444
2445 /* If we are reusing the unit, then just exit, systemd will do
2446 * the right thing when we exit. */
2447 if (arg_keep_unit)
2448 return 0;
2449
2450 r = sd_bus_default_system(&bus);
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to open system bus: %m");
2453
2454 r = sd_bus_call_method(
2455 bus,
2456 "org.freedesktop.machine1",
2457 "/org/freedesktop/machine1",
2458 "org.freedesktop.machine1.Manager",
2459 "GetMachineByPID",
2460 &error,
2461 &reply,
2462 "u",
2463 (uint32_t) pid);
2464 if (r < 0) {
2465 /* Note that the machine might already have been
2466 * cleaned up automatically, hence don't consider it a
2467 * failure if we cannot get the machine object. */
2468 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2469 return 0;
2470 }
2471
2472 r = sd_bus_message_read(reply, "o", &path);
2473 if (r < 0)
2474 return bus_log_parse_error(r);
2475
2476 r = sd_bus_call_method(
2477 bus,
2478 "org.freedesktop.machine1",
2479 path,
2480 "org.freedesktop.machine1.Machine",
2481 "Terminate",
2482 &error,
2483 NULL,
2484 NULL);
2485 if (r < 0) {
2486 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2487 return 0;
2488 }
2489
2490 return 0;
2491 }
2492
2493 static int reset_audit_loginuid(void) {
2494 _cleanup_free_ char *p = NULL;
2495 int r;
2496
2497 if (arg_share_system)
2498 return 0;
2499
2500 r = read_one_line_file("/proc/self/loginuid", &p);
2501 if (r == -ENOENT)
2502 return 0;
2503 if (r < 0)
2504 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2505
2506 /* Already reset? */
2507 if (streq(p, "4294967295"))
2508 return 0;
2509
2510 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2511 if (r < 0) {
2512 log_error_errno(r,
2513 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2514 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2515 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2516 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2517 "using systemd-nspawn. Sleeping for 5s... (%m)");
2518
2519 sleep(5);
2520 }
2521
2522 return 0;
2523 }
2524
2525 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2526 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2527 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2528
2529 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2530 uint8_t result[8];
2531 size_t l, sz;
2532 uint8_t *v, *i;
2533 int r;
2534
2535 l = strlen(arg_machine);
2536 sz = sizeof(sd_id128_t) + l;
2537 if (idx > 0)
2538 sz += sizeof(idx);
2539
2540 v = alloca(sz);
2541
2542 /* fetch some persistent data unique to the host */
2543 r = sd_id128_get_machine((sd_id128_t*) v);
2544 if (r < 0)
2545 return r;
2546
2547 /* combine with some data unique (on this host) to this
2548 * container instance */
2549 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2550 if (idx > 0) {
2551 idx = htole64(idx);
2552 memcpy(i, &idx, sizeof(idx));
2553 }
2554
2555 /* Let's hash the host machine ID plus the container name. We
2556 * use a fixed, but originally randomly created hash key here. */
2557 siphash24(result, v, sz, hash_key.bytes);
2558
2559 assert_cc(ETH_ALEN <= sizeof(result));
2560 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2561
2562 /* see eth_random_addr in the kernel */
2563 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2564 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2565
2566 return 0;
2567 }
2568
2569 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2570 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2571 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2572 struct ether_addr mac_host, mac_container;
2573 int r, i;
2574
2575 if (!arg_private_network)
2576 return 0;
2577
2578 if (!arg_network_veth)
2579 return 0;
2580
2581 /* Use two different interface name prefixes depending whether
2582 * we are in bridge mode or not. */
2583 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2584 arg_network_bridge ? "vb" : "ve", arg_machine);
2585
2586 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2587 if (r < 0)
2588 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2589
2590 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2591 if (r < 0)
2592 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2593
2594 r = sd_netlink_open(&rtnl);
2595 if (r < 0)
2596 return log_error_errno(r, "Failed to connect to netlink: %m");
2597
2598 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2599 if (r < 0)
2600 return log_error_errno(r, "Failed to allocate netlink message: %m");
2601
2602 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2603 if (r < 0)
2604 return log_error_errno(r, "Failed to add netlink interface name: %m");
2605
2606 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2607 if (r < 0)
2608 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2609
2610 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2611 if (r < 0)
2612 return log_error_errno(r, "Failed to open netlink container: %m");
2613
2614 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2615 if (r < 0)
2616 return log_error_errno(r, "Failed to open netlink container: %m");
2617
2618 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2619 if (r < 0)
2620 return log_error_errno(r, "Failed to open netlink container: %m");
2621
2622 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2623 if (r < 0)
2624 return log_error_errno(r, "Failed to add netlink interface name: %m");
2625
2626 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2627 if (r < 0)
2628 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2629
2630 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2631 if (r < 0)
2632 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2633
2634 r = sd_netlink_message_close_container(m);
2635 if (r < 0)
2636 return log_error_errno(r, "Failed to close netlink container: %m");
2637
2638 r = sd_netlink_message_close_container(m);
2639 if (r < 0)
2640 return log_error_errno(r, "Failed to close netlink container: %m");
2641
2642 r = sd_netlink_message_close_container(m);
2643 if (r < 0)
2644 return log_error_errno(r, "Failed to close netlink container: %m");
2645
2646 r = sd_netlink_call(rtnl, m, 0, NULL);
2647 if (r < 0)
2648 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2649
2650 i = (int) if_nametoindex(iface_name);
2651 if (i <= 0)
2652 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2653
2654 *ifi = i;
2655
2656 return 0;
2657 }
2658
2659 static int setup_bridge(const char veth_name[], int *ifi) {
2660 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2661 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2662 int r, bridge;
2663
2664 if (!arg_private_network)
2665 return 0;
2666
2667 if (!arg_network_veth)
2668 return 0;
2669
2670 if (!arg_network_bridge)
2671 return 0;
2672
2673 bridge = (int) if_nametoindex(arg_network_bridge);
2674 if (bridge <= 0)
2675 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2676
2677 *ifi = bridge;
2678
2679 r = sd_netlink_open(&rtnl);
2680 if (r < 0)
2681 return log_error_errno(r, "Failed to connect to netlink: %m");
2682
2683 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2684 if (r < 0)
2685 return log_error_errno(r, "Failed to allocate netlink message: %m");
2686
2687 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2688 if (r < 0)
2689 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2690
2691 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2692 if (r < 0)
2693 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2694
2695 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2696 if (r < 0)
2697 return log_error_errno(r, "Failed to add netlink master field: %m");
2698
2699 r = sd_netlink_call(rtnl, m, 0, NULL);
2700 if (r < 0)
2701 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2702
2703 return 0;
2704 }
2705
2706 static int parse_interface(struct udev *udev, const char *name) {
2707 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2708 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2709 int ifi;
2710
2711 ifi = (int) if_nametoindex(name);
2712 if (ifi <= 0)
2713 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2714
2715 sprintf(ifi_str, "n%i", ifi);
2716 d = udev_device_new_from_device_id(udev, ifi_str);
2717 if (!d)
2718 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2719
2720 if (udev_device_get_is_initialized(d) <= 0) {
2721 log_error("Network interface %s is not initialized yet.", name);
2722 return -EBUSY;
2723 }
2724
2725 return ifi;
2726 }
2727
2728 static int move_network_interfaces(pid_t pid) {
2729 _cleanup_udev_unref_ struct udev *udev = NULL;
2730 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2731 char **i;
2732 int r;
2733
2734 if (!arg_private_network)
2735 return 0;
2736
2737 if (strv_isempty(arg_network_interfaces))
2738 return 0;
2739
2740 r = sd_netlink_open(&rtnl);
2741 if (r < 0)
2742 return log_error_errno(r, "Failed to connect to netlink: %m");
2743
2744 udev = udev_new();
2745 if (!udev) {
2746 log_error("Failed to connect to udev.");
2747 return -ENOMEM;
2748 }
2749
2750 STRV_FOREACH(i, arg_network_interfaces) {
2751 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2752 int ifi;
2753
2754 ifi = parse_interface(udev, *i);
2755 if (ifi < 0)
2756 return ifi;
2757
2758 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2759 if (r < 0)
2760 return log_error_errno(r, "Failed to allocate netlink message: %m");
2761
2762 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2763 if (r < 0)
2764 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2765
2766 r = sd_netlink_call(rtnl, m, 0, NULL);
2767 if (r < 0)
2768 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2769 }
2770
2771 return 0;
2772 }
2773
2774 static int setup_macvlan(pid_t pid) {
2775 _cleanup_udev_unref_ struct udev *udev = NULL;
2776 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2777 unsigned idx = 0;
2778 char **i;
2779 int r;
2780
2781 if (!arg_private_network)
2782 return 0;
2783
2784 if (strv_isempty(arg_network_macvlan))
2785 return 0;
2786
2787 r = sd_netlink_open(&rtnl);
2788 if (r < 0)
2789 return log_error_errno(r, "Failed to connect to netlink: %m");
2790
2791 udev = udev_new();
2792 if (!udev) {
2793 log_error("Failed to connect to udev.");
2794 return -ENOMEM;
2795 }
2796
2797 STRV_FOREACH(i, arg_network_macvlan) {
2798 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2799 _cleanup_free_ char *n = NULL;
2800 struct ether_addr mac;
2801 int ifi;
2802
2803 ifi = parse_interface(udev, *i);
2804 if (ifi < 0)
2805 return ifi;
2806
2807 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2808 if (r < 0)
2809 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2810
2811 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2812 if (r < 0)
2813 return log_error_errno(r, "Failed to allocate netlink message: %m");
2814
2815 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2816 if (r < 0)
2817 return log_error_errno(r, "Failed to add netlink interface index: %m");
2818
2819 n = strappend("mv-", *i);
2820 if (!n)
2821 return log_oom();
2822
2823 strshorten(n, IFNAMSIZ-1);
2824
2825 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2826 if (r < 0)
2827 return log_error_errno(r, "Failed to add netlink interface name: %m");
2828
2829 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2830 if (r < 0)
2831 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2832
2833 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2834 if (r < 0)
2835 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2836
2837 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2838 if (r < 0)
2839 return log_error_errno(r, "Failed to open netlink container: %m");
2840
2841 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2842 if (r < 0)
2843 return log_error_errno(r, "Failed to open netlink container: %m");
2844
2845 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2846 if (r < 0)
2847 return log_error_errno(r, "Failed to append macvlan mode: %m");
2848
2849 r = sd_netlink_message_close_container(m);
2850 if (r < 0)
2851 return log_error_errno(r, "Failed to close netlink container: %m");
2852
2853 r = sd_netlink_message_close_container(m);
2854 if (r < 0)
2855 return log_error_errno(r, "Failed to close netlink container: %m");
2856
2857 r = sd_netlink_call(rtnl, m, 0, NULL);
2858 if (r < 0)
2859 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2860 }
2861
2862 return 0;
2863 }
2864
2865 static int setup_ipvlan(pid_t pid) {
2866 _cleanup_udev_unref_ struct udev *udev = NULL;
2867 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2868 char **i;
2869 int r;
2870
2871 if (!arg_private_network)
2872 return 0;
2873
2874 if (strv_isempty(arg_network_ipvlan))
2875 return 0;
2876
2877 r = sd_netlink_open(&rtnl);
2878 if (r < 0)
2879 return log_error_errno(r, "Failed to connect to netlink: %m");
2880
2881 udev = udev_new();
2882 if (!udev) {
2883 log_error("Failed to connect to udev.");
2884 return -ENOMEM;
2885 }
2886
2887 STRV_FOREACH(i, arg_network_ipvlan) {
2888 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2889 _cleanup_free_ char *n = NULL;
2890 int ifi;
2891
2892 ifi = parse_interface(udev, *i);
2893 if (ifi < 0)
2894 return ifi;
2895
2896 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2897 if (r < 0)
2898 return log_error_errno(r, "Failed to allocate netlink message: %m");
2899
2900 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2901 if (r < 0)
2902 return log_error_errno(r, "Failed to add netlink interface index: %m");
2903
2904 n = strappend("iv-", *i);
2905 if (!n)
2906 return log_oom();
2907
2908 strshorten(n, IFNAMSIZ-1);
2909
2910 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2911 if (r < 0)
2912 return log_error_errno(r, "Failed to add netlink interface name: %m");
2913
2914 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2915 if (r < 0)
2916 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2917
2918 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2919 if (r < 0)
2920 return log_error_errno(r, "Failed to open netlink container: %m");
2921
2922 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2923 if (r < 0)
2924 return log_error_errno(r, "Failed to open netlink container: %m");
2925
2926 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2927 if (r < 0)
2928 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2929
2930 r = sd_netlink_message_close_container(m);
2931 if (r < 0)
2932 return log_error_errno(r, "Failed to close netlink container: %m");
2933
2934 r = sd_netlink_message_close_container(m);
2935 if (r < 0)
2936 return log_error_errno(r, "Failed to close netlink container: %m");
2937
2938 r = sd_netlink_call(rtnl, m, 0, NULL);
2939 if (r < 0)
2940 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2941 }
2942
2943 return 0;
2944 }
2945
/* Builds and loads a seccomp filter (only when compiled with HAVE_SECCOMP;
 * otherwise this is a no-op returning 0).
 *
 * The filter allows everything by default and then:
 *  - makes each syscall in the table below fail with EPERM, unless the
 *    capability paired with it is present in arg_retain;
 *  - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * -EINVAL from seccomp_load() is treated as "kernel without seccomp support"
 * and turned into success. Returns 0 on success, a negative value after
 * logging otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* A retained capability keeps its syscalls usable. */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means the syscall is unknown; that is not an error. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit hookup
         * fails when run inside one. We don't care and simply turn off the
         * creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with EAFNOSUPPORT,
         * which audit userspace takes as the indication that audit is
         * disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3040
3041 static int setup_propagate(const char *root) {
3042 const char *p, *q;
3043
3044 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3045 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3046 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3047 (void) mkdir_p(p, 0600);
3048
3049 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3050 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3051
3052 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3053 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3054
3055 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3056 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3057
3058 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3059 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3060 return log_error_errno(errno, "Failed to install propagation bind mount.");
3061
3062 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3063 return log_error_errno(errno, "Failed to make propagation mount read-only");
3064
3065 return 0;
3066 }
3067
3068 static int setup_image(char **device_path, int *loop_nr) {
3069 struct loop_info64 info = {
3070 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3071 };
3072 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3073 _cleanup_free_ char* loopdev = NULL;
3074 struct stat st;
3075 int r, nr;
3076
3077 assert(device_path);
3078 assert(loop_nr);
3079 assert(arg_image);
3080
3081 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3082 if (fd < 0)
3083 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3084
3085 if (fstat(fd, &st) < 0)
3086 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3087
3088 if (S_ISBLK(st.st_mode)) {
3089 char *p;
3090
3091 p = strdup(arg_image);
3092 if (!p)
3093 return log_oom();
3094
3095 *device_path = p;
3096
3097 *loop_nr = -1;
3098
3099 r = fd;
3100 fd = -1;
3101
3102 return r;
3103 }
3104
3105 if (!S_ISREG(st.st_mode)) {
3106 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3107 return -EINVAL;
3108 }
3109
3110 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3111 if (control < 0)
3112 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3113
3114 nr = ioctl(control, LOOP_CTL_GET_FREE);
3115 if (nr < 0)
3116 return log_error_errno(errno, "Failed to allocate loop device: %m");
3117
3118 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3119 return log_oom();
3120
3121 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3122 if (loop < 0)
3123 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3124
3125 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3126 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3127
3128 if (arg_read_only)
3129 info.lo_flags |= LO_FLAGS_READ_ONLY;
3130
3131 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3132 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3133
3134 *device_path = loopdev;
3135 loopdev = NULL;
3136
3137 *loop_nr = nr;
3138
3139 r = loop;
3140 loop = -1;
3141
3142 return r;
3143 }
3144
/* Explanatory text appended to the "no usable partition table" error messages;
 * one string literal per output line. */
#define PARTITION_TABLE_BLURB                                                                                                    \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                     \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                         \
        "to be bootable with systemd-nspawn."
3151
3152 static int dissect_image(
3153 int fd,
3154 char **root_device, bool *root_device_rw,
3155 char **home_device, bool *home_device_rw,
3156 char **srv_device, bool *srv_device_rw,
3157 bool *secondary) {
3158
3159 #ifdef HAVE_BLKID
3160 int home_nr = -1, srv_nr = -1;
3161 #ifdef GPT_ROOT_NATIVE
3162 int root_nr = -1;
3163 #endif
3164 #ifdef GPT_ROOT_SECONDARY
3165 int secondary_root_nr = -1;
3166 #endif
3167 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3168 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3169 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3170 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3171 _cleanup_udev_unref_ struct udev *udev = NULL;
3172 struct udev_list_entry *first, *item;
3173 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3174 bool is_gpt, is_mbr, multiple_generic = false;
3175 const char *pttype = NULL;
3176 blkid_partlist pl;
3177 struct stat st;
3178 unsigned i;
3179 int r;
3180
3181 assert(fd >= 0);
3182 assert(root_device);
3183 assert(home_device);
3184 assert(srv_device);
3185 assert(secondary);
3186 assert(arg_image);
3187
3188 b = blkid_new_probe();
3189 if (!b)
3190 return log_oom();
3191
3192 errno = 0;
3193 r = blkid_probe_set_device(b, fd, 0, 0);
3194 if (r != 0) {
3195 if (errno == 0)
3196 return log_oom();
3197
3198 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3199 return -errno;
3200 }
3201
3202 blkid_probe_enable_partitions(b, 1);
3203 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3204
3205 errno = 0;
3206 r = blkid_do_safeprobe(b);
3207 if (r == -2 || r == 1) {
3208 log_error("Failed to identify any partition table on\n"
3209 " %s\n"
3210 PARTITION_TABLE_BLURB, arg_image);
3211 return -EINVAL;
3212 } else if (r != 0) {
3213 if (errno == 0)
3214 errno = EIO;
3215 log_error_errno(errno, "Failed to probe: %m");
3216 return -errno;
3217 }
3218
3219 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3220
3221 is_gpt = streq_ptr(pttype, "gpt");
3222 is_mbr = streq_ptr(pttype, "dos");
3223
3224 if (!is_gpt && !is_mbr) {
3225 log_error("No GPT or MBR partition table discovered on\n"
3226 " %s\n"
3227 PARTITION_TABLE_BLURB, arg_image);
3228 return -EINVAL;
3229 }
3230
3231 errno = 0;
3232 pl = blkid_probe_get_partitions(b);
3233 if (!pl) {
3234 if (errno == 0)
3235 return log_oom();
3236
3237 log_error("Failed to list partitions of %s", arg_image);
3238 return -errno;
3239 }
3240
3241 udev = udev_new();
3242 if (!udev)
3243 return log_oom();
3244
3245 if (fstat(fd, &st) < 0)
3246 return log_error_errno(errno, "Failed to stat block device: %m");
3247
3248 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3249 if (!d)
3250 return log_oom();
3251
3252 for (i = 0;; i++) {
3253 int n, m;
3254
3255 if (i >= 10) {
3256 log_error("Kernel partitions never appeared.");
3257 return -ENXIO;
3258 }
3259
3260 e = udev_enumerate_new(udev);
3261 if (!e)
3262 return log_oom();
3263
3264 r = udev_enumerate_add_match_parent(e, d);
3265 if (r < 0)
3266 return log_oom();
3267
3268 r = udev_enumerate_scan_devices(e);
3269 if (r < 0)
3270 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3271
3272 /* Count the partitions enumerated by the kernel */
3273 n = 0;
3274 first = udev_enumerate_get_list_entry(e);
3275 udev_list_entry_foreach(item, first)
3276 n++;
3277
3278 /* Count the partitions enumerated by blkid */
3279 m = blkid_partlist_numof_partitions(pl);
3280 if (n == m + 1)
3281 break;
3282 if (n > m + 1) {
3283 log_error("blkid and kernel partition list do not match.");
3284 return -EIO;
3285 }
3286 if (n < m + 1) {
3287 unsigned j;
3288
3289 /* The kernel has probed fewer partitions than
3290 * blkid? Maybe the kernel prober is still
3291 * running or it got EBUSY because udev
3292 * already opened the device. Let's reprobe
3293 * the device, which is a synchronous call
3294 * that waits until probing is complete. */
3295
3296 for (j = 0; j < 20; j++) {
3297
3298 r = ioctl(fd, BLKRRPART, 0);
3299 if (r < 0)
3300 r = -errno;
3301 if (r >= 0 || r != -EBUSY)
3302 break;
3303
3304 /* If something else has the device
3305 * open, such as an udev rule, the
3306 * ioctl will return EBUSY. Since
3307 * there's no way to wait until it
3308 * isn't busy anymore, let's just wait
3309 * a bit, and try again.
3310 *
3311 * This is really something they
3312 * should fix in the kernel! */
3313
3314 usleep(50 * USEC_PER_MSEC);
3315 }
3316
3317 if (r < 0)
3318 return log_error_errno(r, "Failed to reread partition table: %m");
3319 }
3320
3321 e = udev_enumerate_unref(e);
3322 }
3323
3324 first = udev_enumerate_get_list_entry(e);
3325 udev_list_entry_foreach(item, first) {
3326 _cleanup_udev_device_unref_ struct udev_device *q;
3327 const char *node;
3328 unsigned long long flags;
3329 blkid_partition pp;
3330 dev_t qn;
3331 int nr;
3332
3333 errno = 0;
3334 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3335 if (!q) {
3336 if (!errno)
3337 errno = ENOMEM;
3338
3339 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3340 return -errno;
3341 }
3342
3343 qn = udev_device_get_devnum(q);
3344 if (major(qn) == 0)
3345 continue;
3346
3347 if (st.st_rdev == qn)
3348 continue;
3349
3350 node = udev_device_get_devnode(q);
3351 if (!node)
3352 continue;
3353
3354 pp = blkid_partlist_devno_to_partition(pl, qn);
3355 if (!pp)
3356 continue;
3357
3358 flags = blkid_partition_get_flags(pp);
3359
3360 nr = blkid_partition_get_partno(pp);
3361 if (nr < 0)
3362 continue;
3363
3364 if (is_gpt) {
3365 sd_id128_t type_id;
3366 const char *stype;
3367
3368 if (flags & GPT_FLAG_NO_AUTO)
3369 continue;
3370
3371 stype = blkid_partition_get_type_string(pp);
3372 if (!stype)
3373 continue;
3374
3375 if (sd_id128_from_string(stype, &type_id) < 0)
3376 continue;
3377
3378 if (sd_id128_equal(type_id, GPT_HOME)) {
3379
3380 if (home && nr >= home_nr)
3381 continue;
3382
3383 home_nr = nr;
3384 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3385
3386 r = free_and_strdup(&home, node);
3387 if (r < 0)
3388 return log_oom();
3389
3390 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3391
3392 if (srv && nr >= srv_nr)
3393 continue;
3394
3395 srv_nr = nr;
3396 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3397
3398 r = free_and_strdup(&srv, node);
3399 if (r < 0)
3400 return log_oom();
3401 }
3402 #ifdef GPT_ROOT_NATIVE
3403 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3404
3405 if (root && nr >= root_nr)
3406 continue;
3407
3408 root_nr = nr;
3409 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3410
3411 r = free_and_strdup(&root, node);
3412 if (r < 0)
3413 return log_oom();
3414 }
3415 #endif
3416 #ifdef GPT_ROOT_SECONDARY
3417 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3418
3419 if (secondary_root && nr >= secondary_root_nr)
3420 continue;
3421
3422 secondary_root_nr = nr;
3423 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3424
3425 r = free_and_strdup(&secondary_root, node);
3426 if (r < 0)
3427 return log_oom();
3428 }
3429 #endif
3430 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3431
3432 if (generic)
3433 multiple_generic = true;
3434 else {
3435 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3436
3437 r = free_and_strdup(&generic, node);
3438 if (r < 0)
3439 return log_oom();
3440 }
3441 }
3442
3443 } else if (is_mbr) {
3444 int type;
3445
3446 if (flags != 0x80) /* Bootable flag */
3447 continue;
3448
3449 type = blkid_partition_get_type(pp);
3450 if (type != 0x83) /* Linux partition */
3451 continue;
3452
3453 if (generic)
3454 multiple_generic = true;
3455 else {
3456 generic_rw = true;
3457
3458 r = free_and_strdup(&root, node);
3459 if (r < 0)
3460 return log_oom();
3461 }
3462 }
3463 }
3464
3465 if (root) {
3466 *root_device = root;
3467 root = NULL;
3468
3469 *root_device_rw = root_rw;
3470 *secondary = false;
3471 } else if (secondary_root) {
3472 *root_device = secondary_root;
3473 secondary_root = NULL;
3474
3475 *root_device_rw = secondary_root_rw;
3476 *secondary = true;
3477 } else if (generic) {
3478
3479 /* There were no partitions with precise meanings
3480 * around, but we found generic partitions. In this
3481 * case, if there's only one, we can go ahead and boot
3482 * it, otherwise we bail out, because we really cannot
3483 * make any sense of it. */
3484
3485 if (multiple_generic) {
3486 log_error("Identified multiple bootable Linux partitions on\n"
3487 " %s\n"
3488 PARTITION_TABLE_BLURB, arg_image);
3489 return -EINVAL;
3490 }
3491
3492 *root_device = generic;
3493 generic = NULL;
3494
3495 *root_device_rw = generic_rw;
3496 *secondary = false;
3497 } else {
3498 log_error("Failed to identify root partition in disk image\n"
3499 " %s\n"
3500 PARTITION_TABLE_BLURB, arg_image);
3501 return -EINVAL;
3502 }
3503
3504 if (home) {
3505 *home_device = home;
3506 home = NULL;
3507
3508 *home_device_rw = home_rw;
3509 }
3510
3511 if (srv) {
3512 *srv_device = srv;
3513 srv = NULL;
3514
3515 *srv_device_rw = srv_rw;
3516 }
3517
3518 return 0;
3519 #else
3520 log_error("--image= is not supported, compiled without blkid support.");
3521 return -EOPNOTSUPP;
3522 #endif
3523 }
3524
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it with MS_NODEV at 'where' (or at 'where' with 'directory' appended,
 * if 'directory' is non-NULL). The mount is read-only if 'rw' is false or if
 * arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * blkid support compiled in, always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Neither is
                 * an I/O error, hence there is no errno to report. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything else unexpected): a genuine probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, so that the
                 * logging call cannot influence it. */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3588
3589 static int mount_devices(
3590 const char *where,
3591 const char *root_device, bool root_device_rw,
3592 const char *home_device, bool home_device_rw,
3593 const char *srv_device, bool srv_device_rw) {
3594 int r;
3595
3596 assert(where);
3597
3598 if (root_device) {
3599 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3600 if (r < 0)
3601 return log_error_errno(r, "Failed to mount root directory: %m");
3602 }
3603
3604 if (home_device) {
3605 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3606 if (r < 0)
3607 return log_error_errno(r, "Failed to mount home directory: %m");
3608 }
3609
3610 if (srv_device) {
3611 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3612 if (r < 0)
3613 return log_error_errno(r, "Failed to mount server data directory: %m");
3614 }
3615
3616 return 0;
3617 }
3618
3619 static void loop_remove(int nr, int *image_fd) {
3620 _cleanup_close_ int control = -1;
3621 int r;
3622
3623 if (nr < 0)
3624 return;
3625
3626 if (image_fd && *image_fd >= 0) {
3627 r = ioctl(*image_fd, LOOP_CLR_FD);
3628 if (r < 0)
3629 log_debug_errno(errno, "Failed to close loop image: %m");
3630 *image_fd = safe_close(*image_fd);
3631 }
3632
3633 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3634 if (control < 0) {
3635 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3636 return;
3637 }
3638
3639 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3640 if (r < 0)
3641 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3642 }
3643
3644 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3645 int pipe_fds[2];
3646 pid_t pid;
3647
3648 assert(database);
3649 assert(key);
3650 assert(rpid);
3651
3652 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3653 return log_error_errno(errno, "Failed to allocate pipe: %m");
3654
3655 pid = fork();
3656 if (pid < 0)
3657 return log_error_errno(errno, "Failed to fork getent child: %m");
3658 else if (pid == 0) {
3659 int nullfd;
3660 char *empty_env = NULL;
3661
3662 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3663 _exit(EXIT_FAILURE);
3664
3665 if (pipe_fds[0] > 2)
3666 safe_close(pipe_fds[0]);
3667 if (pipe_fds[1] > 2)
3668 safe_close(pipe_fds[1]);
3669
3670 nullfd = open("/dev/null", O_RDWR);
3671 if (nullfd < 0)
3672 _exit(EXIT_FAILURE);
3673
3674 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3675 _exit(EXIT_FAILURE);
3676
3677 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3678 _exit(EXIT_FAILURE);
3679
3680 if (nullfd > 2)
3681 safe_close(nullfd);
3682
3683 (void) reset_all_signal_handlers();
3684 (void) reset_signal_mask();
3685 close_all_fds(NULL, 0);
3686
3687 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3688 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3689 _exit(EXIT_FAILURE);
3690 }
3691
3692 pipe_fds[1] = safe_close(pipe_fds[1]);
3693
3694 *rpid = pid;
3695
3696 return pipe_fds[0];
3697 }
3698
3699 static int change_uid_gid(char **_home) {
3700 char line[LINE_MAX], *x, *u, *g, *h;
3701 const char *word, *state;
3702 _cleanup_free_ uid_t *uids = NULL;
3703 _cleanup_free_ char *home = NULL;
3704 _cleanup_fclose_ FILE *f = NULL;
3705 _cleanup_close_ int fd = -1;
3706 unsigned n_uids = 0;
3707 size_t sz = 0, l;
3708 uid_t uid;
3709 gid_t gid;
3710 pid_t pid;
3711 int r;
3712
3713 assert(_home);
3714
3715 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3716 /* Reset everything fully to 0, just in case */
3717
3718 r = reset_uid_gid();
3719 if (r < 0)
3720 return log_error_errno(r, "Failed to become root: %m");
3721
3722 *_home = NULL;
3723 return 0;
3724 }
3725
3726 /* First, get user credentials */
3727 fd = spawn_getent("passwd", arg_user, &pid);
3728 if (fd < 0)
3729 return fd;
3730
3731 f = fdopen(fd, "r");
3732 if (!f)
3733 return log_oom();
3734 fd = -1;
3735
3736 if (!fgets(line, sizeof(line), f)) {
3737
3738 if (!ferror(f)) {
3739 log_error("Failed to resolve user %s.", arg_user);
3740 return -ESRCH;
3741 }
3742
3743 log_error_errno(errno, "Failed to read from getent: %m");
3744 return -errno;
3745 }
3746
3747 truncate_nl(line);
3748
3749 wait_for_terminate_and_warn("getent passwd", pid, true);
3750
3751 x = strchr(line, ':');
3752 if (!x) {
3753 log_error("/etc/passwd entry has invalid user field.");
3754 return -EIO;
3755 }
3756
3757 u = strchr(x+1, ':');
3758 if (!u) {
3759 log_error("/etc/passwd entry has invalid password field.");
3760 return -EIO;
3761 }
3762
3763 u++;
3764 g = strchr(u, ':');
3765 if (!g) {
3766 log_error("/etc/passwd entry has invalid UID field.");
3767 return -EIO;
3768 }
3769
3770 *g = 0;
3771 g++;
3772 x = strchr(g, ':');
3773 if (!x) {
3774 log_error("/etc/passwd entry has invalid GID field.");
3775 return -EIO;
3776 }
3777
3778 *x = 0;
3779 h = strchr(x+1, ':');
3780 if (!h) {
3781 log_error("/etc/passwd entry has invalid GECOS field.");
3782 return -EIO;
3783 }
3784
3785 h++;
3786 x = strchr(h, ':');
3787 if (!x) {
3788 log_error("/etc/passwd entry has invalid home directory field.");
3789 return -EIO;
3790 }
3791
3792 *x = 0;
3793
3794 r = parse_uid(u, &uid);
3795 if (r < 0) {
3796 log_error("Failed to parse UID of user.");
3797 return -EIO;
3798 }
3799
3800 r = parse_gid(g, &gid);
3801 if (r < 0) {
3802 log_error("Failed to parse GID of user.");
3803 return -EIO;
3804 }
3805
3806 home = strdup(h);
3807 if (!home)
3808 return log_oom();
3809
3810 /* Second, get group memberships */
3811 fd = spawn_getent("initgroups", arg_user, &pid);
3812 if (fd < 0)
3813 return fd;
3814
3815 fclose(f);
3816 f = fdopen(fd, "r");
3817 if (!f)
3818 return log_oom();
3819 fd = -1;
3820
3821 if (!fgets(line, sizeof(line), f)) {
3822 if (!ferror(f)) {
3823 log_error("Failed to resolve user %s.", arg_user);
3824 return -ESRCH;
3825 }
3826
3827 log_error_errno(errno, "Failed to read from getent: %m");
3828 return -errno;
3829 }
3830
3831 truncate_nl(line);
3832
3833 wait_for_terminate_and_warn("getent initgroups", pid, true);
3834
3835 /* Skip over the username and subsequent separator whitespace */
3836 x = line;
3837 x += strcspn(x, WHITESPACE);
3838 x += strspn(x, WHITESPACE);
3839
3840 FOREACH_WORD(word, l, x, state) {
3841 char c[l+1];
3842
3843 memcpy(c, word, l);
3844 c[l] = 0;
3845
3846 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3847 return log_oom();
3848
3849 r = parse_uid(c, &uids[n_uids++]);
3850 if (r < 0) {
3851 log_error("Failed to parse group data from getent.");
3852 return -EIO;
3853 }
3854 }
3855
3856 r = mkdir_parents(home, 0775);
3857 if (r < 0)
3858 return log_error_errno(r, "Failed to make home root directory: %m");
3859
3860 r = mkdir_safe(home, 0755, uid, gid);
3861 if (r < 0 && r != -EEXIST)
3862 return log_error_errno(r, "Failed to make home directory: %m");
3863
3864 (void) fchown(STDIN_FILENO, uid, gid);
3865 (void) fchown(STDOUT_FILENO, uid, gid);
3866 (void) fchown(STDERR_FILENO, uid, gid);
3867
3868 if (setgroups(n_uids, uids) < 0)
3869 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3870
3871 if (setresgid(gid, gid, gid) < 0)
3872 return log_error_errno(errno, "setregid() failed: %m");
3873
3874 if (setresuid(uid, uid, uid) < 0)
3875 return log_error_errno(errno, "setreuid() failed: %m");
3876
3877 if (_home) {
3878 *_home = home;
3879 home = NULL;
3880 }
3881
3882 return 0;
3883 }
3884
3885 /*
3886 * Return values:
3887 * < 0 : wait_for_terminate() failed to get the state of the
3888 * container, the container was terminated by a signal, or
3889 * failed for an unknown reason. No change is made to the
3890 * container argument.
3891 * > 0 : The program executed in the container terminated with an
3892 * error. The exit code of the program executed in the
3893 * container is returned. The container argument has been set
3894 * to CONTAINER_TERMINATED.
3895 * 0 : The container is being rebooted, has been shut down or exited
3896 * successfully. The container argument has been set to either
3897 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3898 *
3899 * That is, success is indicated by a return value of zero, and an
3900 * error is indicated by a non-zero value.
3901 */
3902 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3903 siginfo_t status;
3904 int r;
3905
3906 r = wait_for_terminate(pid, &status);
3907 if (r < 0)
3908 return log_warning_errno(r, "Failed to wait for container: %m");
3909
3910 switch (status.si_code) {
3911
3912 case CLD_EXITED:
3913 if (status.si_status == 0) {
3914 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3915
3916 } else
3917 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3918
3919 *container = CONTAINER_TERMINATED;
3920 return status.si_status;
3921
3922 case CLD_KILLED:
3923 if (status.si_status == SIGINT) {
3924
3925 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3926 *container = CONTAINER_TERMINATED;
3927 return 0;
3928
3929 } else if (status.si_status == SIGHUP) {
3930
3931 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3932 *container = CONTAINER_REBOOTED;
3933 return 0;
3934 }
3935
3936 /* CLD_KILLED fallthrough */
3937
3938 case CLD_DUMPED:
3939 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3940 return -EIO;
3941
3942 default:
3943 log_error("Container %s failed due to unknown reason.", arg_machine);
3944 return -EIO;
3945 }
3946
3947 return r;
3948 }
3949
/* Signal handler that intentionally does nothing with the signal it gets. */
static void nop_handler(int sig) {
}
3951
3952 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3953 pid_t pid;
3954
3955 pid = PTR_TO_UINT32(userdata);
3956 if (pid > 0) {
3957 if (kill(pid, arg_kill_signal) >= 0) {
3958 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3959 sd_event_source_set_userdata(s, NULL);
3960 return 0;
3961 }
3962 }
3963
3964 sd_event_exit(sd_event_source_get_event(s), 0);
3965 return 0;
3966 }
3967
3968 static int determine_names(void) {
3969 int r;
3970
3971 if (!arg_image && !arg_directory) {
3972 if (arg_machine) {
3973 _cleanup_(image_unrefp) Image *i = NULL;
3974
3975 r = image_find(arg_machine, &i);
3976 if (r < 0)
3977 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3978 else if (r == 0) {
3979 log_error("No image for machine '%s': %m", arg_machine);
3980 return -ENOENT;
3981 }
3982
3983 if (i->type == IMAGE_RAW)
3984 r = set_sanitized_path(&arg_image, i->path);
3985 else
3986 r = set_sanitized_path(&arg_directory, i->path);
3987 if (r < 0)
3988 return log_error_errno(r, "Invalid image directory: %m");
3989
3990 if (!arg_ephemeral)
3991 arg_read_only = arg_read_only || i->read_only;
3992 } else
3993 arg_directory = get_current_dir_name();
3994
3995 if (!arg_directory && !arg_machine) {
3996 log_error("Failed to determine path, please use -D or -i.");
3997 return -EINVAL;
3998 }
3999 }
4000
4001 if (!arg_machine) {
4002 if (arg_directory && path_equal(arg_directory, "/"))
4003 arg_machine = gethostname_malloc();
4004 else
4005 arg_machine = strdup(basename(arg_image ?: arg_directory));
4006
4007 if (!arg_machine)
4008 return log_oom();
4009
4010 hostname_cleanup(arg_machine, false);
4011 if (!machine_name_is_valid(arg_machine)) {
4012 log_error("Failed to determine machine name automatically, please use -M.");
4013 return -EINVAL;
4014 }
4015
4016 if (arg_ephemeral) {
4017 char *b;
4018
4019 /* Add a random suffix when this is an
4020 * ephemeral machine, so that we can run many
4021 * instances at once without manually having
4022 * to specify -M each time. */
4023
4024 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4025 return log_oom();
4026
4027 free(arg_machine);
4028 arg_machine = b;
4029 }
4030 }
4031
4032 return 0;
4033 }
4034
4035 static int determine_uid_shift(const char *directory) {
4036 int r;
4037
4038 if (!arg_userns) {
4039 arg_uid_shift = 0;
4040 return 0;
4041 }
4042
4043 if (arg_uid_shift == UID_INVALID) {
4044 struct stat st;
4045
4046 r = stat(directory, &st);
4047 if (r < 0)
4048 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4049
4050 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4051
4052 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4053 log_error("UID and GID base of %s don't match.", directory);
4054 return -EINVAL;
4055 }
4056
4057 arg_uid_range = UINT32_C(0x10000);
4058 }
4059
4060 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4061 log_error("UID base too high for UID range.");
4062 return -EINVAL;
4063 }
4064
4065 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4066 return 0;
4067 }
4068
4069 static int inner_child(
4070 Barrier *barrier,
4071 const char *directory,
4072 bool secondary,
4073 int kmsg_socket,
4074 int rtnl_socket,
4075 FDSet *fds,
4076 int argc,
4077 char *argv[]) {
4078
4079 _cleanup_free_ char *home = NULL;
4080 unsigned n_env = 2;
4081 const char *envp[] = {
4082 "PATH=" DEFAULT_PATH_SPLIT_USR,
4083 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4084 NULL, /* TERM */
4085 NULL, /* HOME */
4086 NULL, /* USER */
4087 NULL, /* LOGNAME */
4088 NULL, /* container_uuid */
4089 NULL, /* LISTEN_FDS */
4090 NULL, /* LISTEN_PID */
4091 NULL
4092 };
4093
4094 _cleanup_strv_free_ char **env_use = NULL;
4095 int r;
4096
4097 assert(barrier);
4098 assert(directory);
4099 assert(kmsg_socket >= 0);
4100
4101 if (arg_userns) {
4102 /* Tell the parent, that it now can write the UID map. */
4103 (void) barrier_place(barrier); /* #1 */
4104
4105 /* Wait until the parent wrote the UID map */
4106 if (!barrier_place_and_sync(barrier)) { /* #2 */
4107 log_error("Parent died too early");
4108 return -ESRCH;
4109 }
4110 }
4111
4112 r = mount_all(NULL, true);
4113 if (r < 0)
4114 return r;
4115
4116 /* Wait until we are cgroup-ified, so that we
4117 * can mount the right cgroup path writable */
4118 if (!barrier_place_and_sync(barrier)) { /* #3 */
4119 log_error("Parent died too early");
4120 return -ESRCH;
4121 }
4122
4123 r = mount_systemd_cgroup_writable("");
4124 if (r < 0)
4125 return r;
4126
4127 r = reset_uid_gid();
4128 if (r < 0)
4129 return log_error_errno(r, "Couldn't become new root: %m");
4130
4131 r = setup_boot_id(NULL);
4132 if (r < 0)
4133 return r;
4134
4135 r = setup_kmsg(NULL, kmsg_socket);
4136 if (r < 0)
4137 return r;
4138 kmsg_socket = safe_close(kmsg_socket);
4139
4140 umask(0022);
4141
4142 if (setsid() < 0)
4143 return log_error_errno(errno, "setsid() failed: %m");
4144
4145 if (arg_private_network)
4146 loopback_setup();
4147
4148 r = send_rtnl(rtnl_socket);
4149 if (r < 0)
4150 return r;
4151 rtnl_socket = safe_close(rtnl_socket);
4152
4153 if (drop_capabilities() < 0)
4154 return log_error_errno(errno, "drop_capabilities() failed: %m");
4155
4156 setup_hostname();
4157
4158 if (arg_personality != PERSONALITY_INVALID) {
4159 if (personality(arg_personality) < 0)
4160 return log_error_errno(errno, "personality() failed: %m");
4161 } else if (secondary) {
4162 if (personality(PER_LINUX32) < 0)
4163 return log_error_errno(errno, "personality() failed: %m");
4164 }
4165
4166 #ifdef HAVE_SELINUX
4167 if (arg_selinux_context)
4168 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4169 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4170 #endif
4171
4172 r = change_uid_gid(&home);
4173 if (r < 0)
4174 return r;
4175
4176 envp[n_env] = strv_find_prefix(environ, "TERM=");
4177 if (envp[n_env])
4178 n_env ++;
4179
4180 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4181 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4182 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4183 return log_oom();
4184
4185 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4186 char as_uuid[37];
4187
4188 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4189 return log_oom();
4190 }
4191
4192 if (fdset_size(fds) > 0) {
4193 r = fdset_cloexec(fds, false);
4194 if (r < 0)
4195 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4196
4197 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4198 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4199 return log_oom();
4200 }
4201
4202 env_use = strv_env_merge(2, envp, arg_setenv);
4203 if (!env_use)
4204 return log_oom();
4205
4206 /* Let the parent know that we are ready and
4207 * wait until the parent is ready with the
4208 * setup, too... */
4209 if (!barrier_place_and_sync(barrier)) { /* #4 */
4210 log_error("Parent died too early");
4211 return -ESRCH;
4212 }
4213
4214 /* Now, explicitly close the log, so that we
4215 * then can close all remaining fds. Closing
4216 * the log explicitly first has the benefit
4217 * that the logging subsystem knows about it,
4218 * and is thus ready to be reopened should we
4219 * need it again. Note that the other fds
4220 * closed here are at least the locking and
4221 * barrier fds. */
4222 log_close();
4223 (void) fdset_close_others(fds);
4224
4225 if (arg_boot) {
4226 char **a;
4227 size_t m;
4228
4229 /* Automatically search for the init system */
4230
4231 m = 1 + argc - optind;
4232 a = newa(char*, m + 1);
4233 memcpy(a + 1, argv + optind, m * sizeof(char*));
4234
4235 a[0] = (char*) "/usr/lib/systemd/systemd";
4236 execve(a[0], a, env_use);
4237
4238 a[0] = (char*) "/lib/systemd/systemd";
4239 execve(a[0], a, env_use);
4240
4241 a[0] = (char*) "/sbin/init";
4242 execve(a[0], a, env_use);
4243 } else if (argc > optind)
4244 execvpe(argv[optind], argv + optind, env_use);
4245 else {
4246 chdir(home ? home : "/root");
4247 execle("/bin/bash", "-bash", NULL, env_use);
4248 execle("/bin/sh", "-sh", NULL, env_use);
4249 }
4250
4251 (void) log_open();
4252 return log_error_errno(errno, "execv() failed: %m");
4253 }
4254
4255 static int outer_child(
4256 Barrier *barrier,
4257 const char *directory,
4258 const char *console,
4259 const char *root_device, bool root_device_rw,
4260 const char *home_device, bool home_device_rw,
4261 const char *srv_device, bool srv_device_rw,
4262 bool interactive,
4263 bool secondary,
4264 int pid_socket,
4265 int kmsg_socket,
4266 int rtnl_socket,
4267 int uid_shift_socket,
4268 FDSet *fds,
4269 int argc,
4270 char *argv[]) {
4271
4272 pid_t pid;
4273 ssize_t l;
4274 int r;
4275
4276 assert(barrier);
4277 assert(directory);
4278 assert(console);
4279 assert(pid_socket >= 0);
4280 assert(kmsg_socket >= 0);
4281
4282 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4283 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4284
4285 if (interactive) {
4286 close_nointr(STDIN_FILENO);
4287 close_nointr(STDOUT_FILENO);
4288 close_nointr(STDERR_FILENO);
4289
4290 r = open_terminal(console, O_RDWR);
4291 if (r != STDIN_FILENO) {
4292 if (r >= 0) {
4293 safe_close(r);
4294 r = -EINVAL;
4295 }
4296
4297 return log_error_errno(r, "Failed to open console: %m");
4298 }
4299
4300 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4301 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4302 return log_error_errno(errno, "Failed to duplicate console: %m");
4303 }
4304
4305 r = reset_audit_loginuid();
4306 if (r < 0)
4307 return r;
4308
4309 /* Mark everything as slave, so that we still
4310 * receive mounts from the real root, but don't
4311 * propagate mounts to the real root. */
4312 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4313 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4314
4315 r = mount_devices(directory,
4316 root_device, root_device_rw,
4317 home_device, home_device_rw,
4318 srv_device, srv_device_rw);
4319 if (r < 0)
4320 return r;
4321
4322 r = determine_uid_shift(directory);
4323 if (r < 0)
4324 return r;
4325
4326 if (arg_userns) {
4327 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4328 if (l < 0)
4329 return log_error_errno(errno, "Failed to send UID shift: %m");
4330 if (l != sizeof(arg_uid_shift)) {
4331 log_error("Short write while sending UID shift.");
4332 return -EIO;
4333 }
4334 }
4335
4336 /* Turn directory into bind mount */
4337 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4338 return log_error_errno(errno, "Failed to make bind mount: %m");
4339
4340 r = setup_volatile(directory);
4341 if (r < 0)
4342 return r;
4343
4344 r = setup_volatile_state(directory);
4345 if (r < 0)
4346 return r;
4347
4348 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4349 if (r < 0)
4350 return r;
4351
4352 if (arg_read_only) {
4353 r = bind_remount_recursive(directory, true);
4354 if (r < 0)
4355 return log_error_errno(r, "Failed to make tree read-only: %m");
4356 }
4357
4358 r = mount_all(directory, false);
4359 if (r < 0)
4360 return r;
4361
4362 if (copy_devnodes(directory) < 0)
4363 return r;
4364
4365 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4366
4367 if (setup_pts(directory) < 0)
4368 return r;
4369
4370 r = setup_propagate(directory);
4371 if (r < 0)
4372 return r;
4373
4374 r = setup_dev_console(directory, console);
4375 if (r < 0)
4376 return r;
4377
4378 r = setup_seccomp();
4379 if (r < 0)
4380 return r;
4381
4382 r = setup_timezone(directory);
4383 if (r < 0)
4384 return r;
4385
4386 r = setup_resolv_conf(directory);
4387 if (r < 0)
4388 return r;
4389
4390 r = setup_journal(directory);
4391 if (r < 0)
4392 return r;
4393
4394 r = mount_custom(directory);
4395 if (r < 0)
4396 return r;
4397
4398 r = mount_cgroup(directory);
4399 if (r < 0)
4400 return r;
4401
4402 r = mount_move_root(directory);
4403 if (r < 0)
4404 return log_error_errno(r, "Failed to move root directory: %m");
4405
4406 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4407 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4408 (arg_private_network ? CLONE_NEWNET : 0) |
4409 (arg_userns ? CLONE_NEWUSER : 0),
4410 NULL);
4411 if (pid < 0)
4412 return log_error_errno(errno, "Failed to fork inner child: %m");
4413
4414 if (pid == 0) {
4415 pid_socket = safe_close(pid_socket);
4416 uid_shift_socket = safe_close(uid_shift_socket);
4417
4418 /* The inner child has all namespaces that are
4419 * requested, so that we all are owned by the user if
4420 * user namespaces are turned on. */
4421
4422 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4423 if (r < 0)
4424 _exit(EXIT_FAILURE);
4425
4426 _exit(EXIT_SUCCESS);
4427 }
4428
4429 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4430 if (l < 0)
4431 return log_error_errno(errno, "Failed to send PID: %m");
4432 if (l != sizeof(pid)) {
4433 log_error("Short write while sending PID.");
4434 return -EIO;
4435 }
4436
4437 pid_socket = safe_close(pid_socket);
4438
4439 return 0;
4440 }
4441
4442 static int setup_uid_map(pid_t pid) {
4443 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4444 int r;
4445
4446 assert(pid > 1);
4447
4448 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4449 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4450 r = write_string_file(uid_map, line, 0);
4451 if (r < 0)
4452 return log_error_errno(r, "Failed to write UID map: %m");
4453
4454 /* We always assign the same UID and GID ranges */
4455 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4456 r = write_string_file(uid_map, line, 0);
4457 if (r < 0)
4458 return log_error_errno(r, "Failed to write GID map: %m");
4459
4460 return 0;
4461 }
4462
4463 static int chown_cgroup(pid_t pid) {
4464 _cleanup_free_ char *path = NULL, *fs = NULL;
4465 _cleanup_close_ int fd = -1;
4466 const char *fn;
4467 int r;
4468
4469 r = cg_pid_get_path(NULL, pid, &path);
4470 if (r < 0)
4471 return log_error_errno(r, "Failed to get container cgroup path: %m");
4472
4473 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4474 if (r < 0)
4475 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4476
4477 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4478 if (fd < 0)
4479 return log_error_errno(errno, "Failed to open %s: %m", fs);
4480
4481 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4482 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4483 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4484
4485 return 0;
4486 }
4487
4488 int main(int argc, char *argv[]) {
4489
4490 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4491 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4492 _cleanup_close_ int master = -1, image_fd = -1;
4493 _cleanup_fdset_free_ FDSet *fds = NULL;
4494 int r, n_fd_passed, loop_nr = -1;
4495 char veth_name[IFNAMSIZ];
4496 bool secondary = false, remove_subvol = false;
4497 sigset_t mask_chld;
4498 pid_t pid = 0;
4499 int ret = EXIT_SUCCESS;
4500 union in_addr_union exposed = {};
4501 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4502 bool interactive;
4503
4504 log_parse_environment();
4505 log_open();
4506
4507 r = parse_argv(argc, argv);
4508 if (r <= 0)
4509 goto finish;
4510
4511 r = determine_names();
4512 if (r < 0)
4513 goto finish;
4514
4515 if (geteuid() != 0) {
4516 log_error("Need to be root.");
4517 r = -EPERM;
4518 goto finish;
4519 }
4520
4521 n_fd_passed = sd_listen_fds(false);
4522 if (n_fd_passed > 0) {
4523 r = fdset_new_listen_fds(&fds, false);
4524 if (r < 0) {
4525 log_error_errno(r, "Failed to collect file descriptors: %m");
4526 goto finish;
4527 }
4528 }
4529
4530 if (arg_directory) {
4531 assert(!arg_image);
4532
4533 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4534 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4535 r = -EINVAL;
4536 goto finish;
4537 }
4538
4539 if (arg_ephemeral) {
4540 _cleanup_free_ char *np = NULL;
4541
4542 /* If the specified path is a mount point we
4543 * generate the new snapshot immediately
4544 * inside it under a random name. However if
4545 * the specified is not a mount point we
4546 * create the new snapshot in the parent
4547 * directory, just next to it. */
4548 r = path_is_mount_point(arg_directory, 0);
4549 if (r < 0) {
4550 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4551 goto finish;
4552 }
4553 if (r > 0)
4554 r = tempfn_random_child(arg_directory, "machine.", &np);
4555 else
4556 r = tempfn_random(arg_directory, "machine.", &np);
4557 if (r < 0) {
4558 log_error_errno(r, "Failed to generate name for snapshot: %m");
4559 goto finish;
4560 }
4561
4562 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4563 if (r < 0) {
4564 log_error_errno(r, "Failed to lock %s: %m", np);
4565 goto finish;
4566 }
4567
4568 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4569 if (r < 0) {
4570 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4571 goto finish;
4572 }
4573
4574 free(arg_directory);
4575 arg_directory = np;
4576 np = NULL;
4577
4578 remove_subvol = true;
4579
4580 } else {
4581 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4582 if (r == -EBUSY) {
4583 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4584 goto finish;
4585 }
4586 if (r < 0) {
4587 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4588 return r;
4589 }
4590
4591 if (arg_template) {
4592 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4593 if (r == -EEXIST) {
4594 if (!arg_quiet)
4595 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4596 } else if (r < 0) {
4597 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4598 goto finish;
4599 } else {
4600 if (!arg_quiet)
4601 log_info("Populated %s from template %s.", arg_directory, arg_template);
4602 }
4603 }
4604 }
4605
4606 if (arg_boot) {
4607 if (path_is_os_tree(arg_directory) <= 0) {
4608 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4609 r = -EINVAL;
4610 goto finish;
4611 }
4612 } else {
4613 const char *p;
4614
4615 p = strjoina(arg_directory,
4616 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4617 if (access(p, F_OK) < 0) {
4618 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4619 r = -EINVAL;
4620 goto finish;
4621 }
4622 }
4623
4624 } else {
4625 char template[] = "/tmp/nspawn-root-XXXXXX";
4626
4627 assert(arg_image);
4628 assert(!arg_template);
4629
4630 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4631 if (r == -EBUSY) {
4632 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4633 goto finish;
4634 }
4635 if (r < 0) {
4636 r = log_error_errno(r, "Failed to create image lock: %m");
4637 goto finish;
4638 }
4639
4640 if (!mkdtemp(template)) {
4641 log_error_errno(errno, "Failed to create temporary directory: %m");
4642 r = -errno;
4643 goto finish;
4644 }
4645
4646 arg_directory = strdup(template);
4647 if (!arg_directory) {
4648 r = log_oom();
4649 goto finish;
4650 }
4651
4652 image_fd = setup_image(&device_path, &loop_nr);
4653 if (image_fd < 0) {
4654 r = image_fd;
4655 goto finish;
4656 }
4657
4658 r = dissect_image(image_fd,
4659 &root_device, &root_device_rw,
4660 &home_device, &home_device_rw,
4661 &srv_device, &srv_device_rw,
4662 &secondary);
4663 if (r < 0)
4664 goto finish;
4665 }
4666
4667 r = custom_mounts_prepare();
4668 if (r < 0)
4669 goto finish;
4670
4671 interactive =
4672 isatty(STDIN_FILENO) > 0 &&
4673 isatty(STDOUT_FILENO) > 0;
4674
4675 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4676 if (master < 0) {
4677 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4678 goto finish;
4679 }
4680
4681 r = ptsname_malloc(master, &console);
4682 if (r < 0) {
4683 r = log_error_errno(r, "Failed to determine tty name: %m");
4684 goto finish;
4685 }
4686
4687 if (unlockpt(master) < 0) {
4688 r = log_error_errno(errno, "Failed to unlock tty: %m");
4689 goto finish;
4690 }
4691
4692 if (!arg_quiet)
4693 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4694 arg_machine, arg_image ?: arg_directory);
4695
4696 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4697
4698 assert_se(sigemptyset(&mask_chld) == 0);
4699 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4700
4701 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4702 r = log_error_errno(errno, "Failed to become subreaper: %m");
4703 goto finish;
4704 }
4705
4706 for (;;) {
4707 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4708 uid_shift_socket_pair[2] = { -1, -1 };
4709 ContainerStatus container_status;
4710 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4711 static const struct sigaction sa = {
4712 .sa_handler = nop_handler,
4713 .sa_flags = SA_NOCLDSTOP,
4714 };
4715 int ifi = 0;
4716 ssize_t l;
4717 _cleanup_event_unref_ sd_event *event = NULL;
4718 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4719 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4720 char last_char = 0;
4721
4722 r = barrier_create(&barrier);
4723 if (r < 0) {
4724 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4725 goto finish;
4726 }
4727
4728 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4729 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4730 goto finish;
4731 }
4732
4733 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4734 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4735 goto finish;
4736 }
4737
4738 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4739 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4740 goto finish;
4741 }
4742
4743 if (arg_userns)
4744 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4745 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4746 goto finish;
4747 }
4748
4749 /* Child can be killed before execv(), so handle SIGCHLD
4750 * in order to interrupt parent's blocking calls and
4751 * give it a chance to call wait() and terminate. */
4752 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4753 if (r < 0) {
4754 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4755 goto finish;
4756 }
4757
4758 r = sigaction(SIGCHLD, &sa, NULL);
4759 if (r < 0) {
4760 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4761 goto finish;
4762 }
4763
4764 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4765 if (pid < 0) {
4766 if (errno == EINVAL)
4767 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4768 else
4769 r = log_error_errno(errno, "clone() failed: %m");
4770
4771 goto finish;
4772 }
4773
4774 if (pid == 0) {
4775 /* The outer child only has a file system namespace. */
4776 barrier_set_role(&barrier, BARRIER_CHILD);
4777
4778 master = safe_close(master);
4779
4780 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4781 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4782 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4783 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4784
4785 (void) reset_all_signal_handlers();
4786 (void) reset_signal_mask();
4787
4788 r = outer_child(&barrier,
4789 arg_directory,
4790 console,
4791 root_device, root_device_rw,
4792 home_device, home_device_rw,
4793 srv_device, srv_device_rw,
4794 interactive,
4795 secondary,
4796 pid_socket_pair[1],
4797 kmsg_socket_pair[1],
4798 rtnl_socket_pair[1],
4799 uid_shift_socket_pair[1],
4800 fds,
4801 argc, argv);
4802 if (r < 0)
4803 _exit(EXIT_FAILURE);
4804
4805 _exit(EXIT_SUCCESS);
4806 }
4807
4808 barrier_set_role(&barrier, BARRIER_PARENT);
4809
4810 fdset_free(fds);
4811 fds = NULL;
4812
4813 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4814 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4815 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4816
4817 /* Wait for the outer child. */
4818 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4819 if (r < 0)
4820 goto finish;
4821 if (r != 0) {
4822 r = -EIO;
4823 goto finish;
4824 }
4825 pid = 0;
4826
4827 /* And now retrieve the PID of the inner child. */
4828 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4829 if (l < 0) {
4830 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4831 goto finish;
4832 }
4833 if (l != sizeof(pid)) {
4834 log_error("Short read while reading inner child PID: %m");
4835 r = EIO;
4836 goto finish;
4837 }
4838
4839 log_debug("Init process invoked as PID " PID_FMT, pid);
4840
4841 if (arg_userns) {
4842 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4843 log_error("Child died too early.");
4844 r = -ESRCH;
4845 goto finish;
4846 }
4847
4848 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4849 if (l < 0) {
4850 r = log_error_errno(errno, "Failed to read UID shift: %m");
4851 goto finish;
4852 }
4853 if (l != sizeof(arg_uid_shift)) {
4854 log_error("Short read while reading UID shift: %m");
4855 r = EIO;
4856 goto finish;
4857 }
4858
4859 r = setup_uid_map(pid);
4860 if (r < 0)
4861 goto finish;
4862
4863 (void) barrier_place(&barrier); /* #2 */
4864 }
4865
4866 r = move_network_interfaces(pid);
4867 if (r < 0)
4868 goto finish;
4869
4870 r = setup_veth(pid, veth_name, &ifi);
4871 if (r < 0)
4872 goto finish;
4873
4874 r = setup_bridge(veth_name, &ifi);
4875 if (r < 0)
4876 goto finish;
4877
4878 r = setup_macvlan(pid);
4879 if (r < 0)
4880 goto finish;
4881
4882 r = setup_ipvlan(pid);
4883 if (r < 0)
4884 goto finish;
4885
4886 r = register_machine(pid, ifi);
4887 if (r < 0)
4888 goto finish;
4889
4890 r = chown_cgroup(pid);
4891 if (r < 0)
4892 goto finish;
4893
4894 /* Notify the child that the parent is ready with all
4895 * its setup (including cgroup-ification), and that
4896 * the child can now hand over control to the code to
4897 * run inside the container. */
4898 (void) barrier_place(&barrier); /* #3 */
4899
4900 /* Block SIGCHLD here, before notifying child.
4901 * process_pty() will handle it with the other signals. */
4902 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4903
4904 /* Reset signal to default */
4905 r = default_signals(SIGCHLD, -1);
4906 if (r < 0) {
4907 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4908 goto finish;
4909 }
4910
4911 /* Let the child know that we are ready and wait that the child is completely ready now. */
4912 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4913 log_error("Client died too early.");
4914 r = -ESRCH;
4915 goto finish;
4916 }
4917
4918 sd_notifyf(false,
4919 "READY=1\n"
4920 "STATUS=Container running.\n"
4921 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4922
4923 r = sd_event_new(&event);
4924 if (r < 0) {
4925 log_error_errno(r, "Failed to get default event source: %m");
4926 goto finish;
4927 }
4928
4929 if (arg_kill_signal > 0) {
4930 /* Try to kill the init system on SIGINT or SIGTERM */
4931 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4932 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4933 } else {
4934 /* Immediately exit */
4935 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4936 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4937 }
4938
4939 /* simply exit on sigchld */
4940 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4941
4942 if (arg_expose_ports) {
4943 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4944 if (r < 0)
4945 goto finish;
4946
4947 (void) expose_ports(rtnl, &exposed);
4948 }
4949
4950 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4951
4952 r = pty_forward_new(event, master, true, !interactive, &forward);
4953 if (r < 0) {
4954 log_error_errno(r, "Failed to create PTY forwarder: %m");
4955 goto finish;
4956 }
4957
4958 r = sd_event_loop(event);
4959 if (r < 0) {
4960 log_error_errno(r, "Failed to run event loop: %m");
4961 goto finish;
4962 }
4963
4964 pty_forward_get_last_char(forward, &last_char);
4965
4966 forward = pty_forward_free(forward);
4967
4968 if (!arg_quiet && last_char != '\n')
4969 putc('\n', stdout);
4970
4971 /* Kill if it is not dead yet anyway */
4972 terminate_machine(pid);
4973
4974 /* Normally redundant, but better safe than sorry */
4975 kill(pid, SIGKILL);
4976
4977 r = wait_for_container(pid, &container_status);
4978 pid = 0;
4979
4980 if (r < 0)
4981 /* We failed to wait for the container, or the
4982 * container exited abnormally */
4983 goto finish;
4984 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4985 /* The container exited with a non-zero
4986 * status, or with zero status and no reboot
4987 * was requested. */
4988 ret = r;
4989 break;
4990 }
4991
4992 /* CONTAINER_REBOOTED, loop again */
4993
4994 if (arg_keep_unit) {
4995 /* Special handling if we are running as a
4996 * service: instead of simply restarting the
4997 * machine we want to restart the entire
4998 * service, so let's inform systemd about this
4999 * with the special exit code 133. The service
5000 * file uses RestartForceExitStatus=133 so
5001 * that this results in a full nspawn
5002 * restart. This is necessary since we might
5003 * have cgroup parameters set we want to have
5004 * flushed out. */
5005 ret = 133;
5006 r = 0;
5007 break;
5008 }
5009
5010 flush_ports(&exposed);
5011 }
5012
5013 finish:
5014 sd_notify(false,
5015 "STOPPING=1\n"
5016 "STATUS=Terminating...");
5017
5018 if (pid > 0)
5019 kill(pid, SIGKILL);
5020
5021 /* Try to flush whatever is still queued in the pty */
5022 if (master >= 0)
5023 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5024
5025 loop_remove(loop_nr, &image_fd);
5026
5027 if (remove_subvol && arg_directory) {
5028 int k;
5029
5030 k = btrfs_subvol_remove(arg_directory, true);
5031 if (k < 0)
5032 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5033 }
5034
5035 if (arg_machine) {
5036 const char *p;
5037
5038 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5039 (void) rm_rf(p, REMOVE_ROOT);
5040 }
5041
5042 free(arg_directory);
5043 free(arg_template);
5044 free(arg_image);
5045 free(arg_machine);
5046 free(arg_user);
5047 strv_free(arg_setenv);
5048 strv_free(arg_network_interfaces);
5049 strv_free(arg_network_macvlan);
5050 strv_free(arg_network_ipvlan);
5051 custom_mount_free_all();
5052
5053 flush_ports(&exposed);
5054
5055 while (arg_expose_ports) {
5056 ExposePort *p = arg_expose_ports;
5057 LIST_REMOVE(ports, arg_expose_ports, p);
5058 free(p);
5059 }
5060
5061 return r < 0 ? EXIT_FAILURE : ret;
5062 }