]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #785 from zonque/free-and-strdup
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
/* One port forwarding entry, as created by the -p/--port= option parser.
 * Entries are chained into the arg_expose_ports list via the "ports" list fields. */
typedef struct ExposePort {
        int protocol;            /* IPPROTO_TCP or IPPROTO_UDP, as chosen by the --port= parser */
        uint16_t host_port;      /* port number on the host side */
        uint16_t container_port; /* port number on the container side */
        LIST_FIELDS(struct ExposePort, ports);
} ExposePort;
111
/* NOTE(review): presumably describes how a container run ended (plain
 * termination vs. reboot request); the users of this enum are outside this
 * part of the file — confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option map to LINK_GUEST/LINK_HOST as well and are
 * distinguished by a separate boolean flag. */
typedef enum LinkJournal {
        LINK_NO,    /* --link-journal=no */
        LINK_AUTO,  /* --link-journal=auto */
        LINK_HOST,  /* --link-journal=host / try-host */
        LINK_GUEST  /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
123
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* boolean false argument */
        VOLATILE_YES,   /* option given without argument, or boolean true argument */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
129
/* Kind of a user-requested mount. The numeric order of the values is also
 * used as tie-breaker when custom mounts with equal destinations are sorted. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND,    /* --bind= / --bind-ro= */
        CUSTOM_MOUNT_TMPFS,   /* --tmpfs= */
        CUSTOM_MOUNT_OVERLAY, /* --overlay= / --overlay-ro= */
} CustomMountType;
135
/* One user-requested mount, stored in the arg_custom_mounts array. All
 * pointer members are owned by the entry and released by
 * custom_mount_free_all(). */
typedef struct CustomMount {
        CustomMountType type;
        bool read_only;    /* set for the "-ro" variants of --bind= and --overlay= */
        char *source; /* for overlayfs this is the upper directory */
        char *destination; /* absolute mount point path, relative to the container root */
        char *options;     /* mount options; filled in for tmpfs mounts */
        char *work_dir;    /* overlay work directory; generated for writable overlays and
                            * removed from disk again when the entry is freed */
        char **lower;      /* overlay only: string vector of lower directories */
} CustomMount;
145
/* Global settings, filled in from the command line by parse_argv(). */
static char *arg_directory = NULL;                      /* -D/--directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_user = NULL;                           /* -u/--user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M/--machine= */
static const char *arg_selinux_context = NULL;          /* -Z; points into argv, not owned */
static const char *arg_selinux_apifs_context = NULL;    /* -L; points into argv, not owned */
static const char *arg_slice = NULL;                    /* -S; points into argv, not owned */
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;               /* set for -j and the "try-" journal modes */
/* Default capability set retained; adjusted by --capability= and
 * --drop-capability= at the end of parse_argv(). */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
static CustomMount *arg_custom_mounts = NULL;           /* array of arg_n_custom_mounts entries */
static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;                        /* --setenv= */
static bool arg_quiet = false;
static bool arg_share_system = false;
static bool arg_register = true;                        /* forced off when --share-system is used */
static bool arg_keep_unit = false;
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static char **arg_network_ipvlan = NULL;                /* --network-ipvlan= */
static bool arg_network_veth = false;
static const char *arg_network_bridge = NULL;           /* points into argv, not owned */
static unsigned long arg_personality = PERSONALITY_INVALID;
static char *arg_image = NULL;                          /* -i/--image= */
static Volatile arg_volatile = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL;             /* list head of -p/--port= entries */
static char **arg_property = NULL;                      /* --property= */
/* --private-users[=UIDBASE[:NUIDS]]; the shift stays UID_INVALID unless a base is given */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
static int arg_kill_signal = 0;                         /* defaults to SIGRTMIN+3 when booting */
207
/* Prints the command line usage summary to stdout. The only substitution is
 * the program's short invocation name in the first line. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               " -h --help Show this help\n"
               " --version Print version string\n"
               " -q --quiet Do not show status information\n"
               " -D --directory=PATH Root directory for the container\n"
               " --template=PATH Initialize root directory from template directory,\n"
               " if missing\n"
               " -x --ephemeral Run container with snapshot of root directory, and\n"
               " remove it after exit\n"
               " -i --image=PATH File system device or disk image for the container\n"
               " -b --boot Boot up full system (i.e. invoke init)\n"
               " -u --user=USER Run the command under specified user or uid\n"
               " -M --machine=NAME Set the machine name for the container\n"
               " --uuid=UUID Set a specific machine UUID for the container\n"
               " -S --slice=SLICE Place the container in the specified slice\n"
               " --property=NAME=VALUE Set scope unit property\n"
               " --private-users[=UIDBASE[:NUIDS]]\n"
               " Run within user namespace\n"
               " --private-network Disable network in container\n"
               " --network-interface=INTERFACE\n"
               " Assign an existing network interface to the\n"
               " container\n"
               " --network-macvlan=INTERFACE\n"
               " Create a macvlan network interface based on an\n"
               " existing network interface to the container\n"
               " --network-ipvlan=INTERFACE\n"
               " Create a ipvlan network interface based on an\n"
               " existing network interface to the container\n"
               " -n --network-veth Add a virtual ethernet connection between host\n"
               " and container\n"
               " --network-bridge=INTERFACE\n"
               " Add a virtual ethernet connection between host\n"
               " and container and add it to an existing bridge on\n"
               " the host\n"
               " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               " Expose a container IP port on the host\n"
               " -Z --selinux-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " processes in the container\n"
               " -L --selinux-apifs-context=SECLABEL\n"
               " Set the SELinux security context to be used by\n"
               " API/tmpfs file systems in the container\n"
               " --capability=CAP In addition to the default, retain specified\n"
               " capability\n"
               " --drop-capability=CAP Drop the specified capability from the default set\n"
               " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
               " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
               " try-guest, try-host\n"
               " -j Equivalent to --link-journal=try-guest\n"
               " --read-only Mount the root directory read-only\n"
               " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
               " the container\n"
               " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
               " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               " --overlay=PATH[:PATH...]:PATH\n"
               " Create an overlay mount from the host to \n"
               " the container\n"
               " --overlay-ro=PATH[:PATH...]:PATH\n"
               " Similar, but creates a read-only overlay mount\n"
               " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
               " --share-system Share system namespaces with host\n"
               " --register=BOOLEAN Register container as machine\n"
               " --keep-unit Do not register a scope for the machine, reuse\n"
               " the service unit nspawn is running in\n"
               " --volatile[=MODE] Run the system in volatile mode\n"
               , program_invocation_short_name);
}
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 free(arg_custom_mounts);
313 arg_custom_mounts = NULL;
314 arg_n_custom_mounts = 0;
315 }
316
317 static int custom_mount_compare(const void *a, const void *b) {
318 const CustomMount *x = a, *y = b;
319 int r;
320
321 r = path_compare(x->destination, y->destination);
322 if (r != 0)
323 return r;
324
325 if (x->type < y->type)
326 return -1;
327 if (x->type > y->type)
328 return 1;
329
330 return 0;
331 }
332
333 static int custom_mounts_prepare(void) {
334 unsigned i;
335 int r;
336
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
339
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i = 0; i < arg_n_custom_mounts; i++) {
342 CustomMount *m = &arg_custom_mounts[i];
343
344 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
345 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
346 return -EINVAL;
347 }
348
349 if (m->type != CUSTOM_MOUNT_OVERLAY)
350 continue;
351
352 if (m->work_dir)
353 continue;
354
355 if (m->read_only)
356 continue;
357
358 r = tempfn_random(m->source, NULL, &m->work_dir);
359 if (r < 0)
360 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
361 }
362
363 return 0;
364 }
365
/* Replaces *b with a cleaned-up version of path. The path is canonicalized
 * if it exists; if it does not exist (ENOENT) it is merely made absolute
 * relative to the current working directory. The previous value of *b is
 * freed. Returns 0 on success, a negative errno-style value on failure, in
 * which case *b is left unmodified. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet), so fall back to a purely lexical result. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
386
387 static int parse_argv(int argc, char *argv[]) {
388
389 enum {
390 ARG_VERSION = 0x100,
391 ARG_PRIVATE_NETWORK,
392 ARG_UUID,
393 ARG_READ_ONLY,
394 ARG_CAPABILITY,
395 ARG_DROP_CAPABILITY,
396 ARG_LINK_JOURNAL,
397 ARG_BIND,
398 ARG_BIND_RO,
399 ARG_TMPFS,
400 ARG_OVERLAY,
401 ARG_OVERLAY_RO,
402 ARG_SETENV,
403 ARG_SHARE_SYSTEM,
404 ARG_REGISTER,
405 ARG_KEEP_UNIT,
406 ARG_NETWORK_INTERFACE,
407 ARG_NETWORK_MACVLAN,
408 ARG_NETWORK_IPVLAN,
409 ARG_NETWORK_BRIDGE,
410 ARG_PERSONALITY,
411 ARG_VOLATILE,
412 ARG_TEMPLATE,
413 ARG_PROPERTY,
414 ARG_PRIVATE_USERS,
415 ARG_KILL_SIGNAL,
416 };
417
418 static const struct option options[] = {
419 { "help", no_argument, NULL, 'h' },
420 { "version", no_argument, NULL, ARG_VERSION },
421 { "directory", required_argument, NULL, 'D' },
422 { "template", required_argument, NULL, ARG_TEMPLATE },
423 { "ephemeral", no_argument, NULL, 'x' },
424 { "user", required_argument, NULL, 'u' },
425 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
426 { "boot", no_argument, NULL, 'b' },
427 { "uuid", required_argument, NULL, ARG_UUID },
428 { "read-only", no_argument, NULL, ARG_READ_ONLY },
429 { "capability", required_argument, NULL, ARG_CAPABILITY },
430 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
431 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
432 { "bind", required_argument, NULL, ARG_BIND },
433 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
434 { "tmpfs", required_argument, NULL, ARG_TMPFS },
435 { "overlay", required_argument, NULL, ARG_OVERLAY },
436 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
437 { "machine", required_argument, NULL, 'M' },
438 { "slice", required_argument, NULL, 'S' },
439 { "setenv", required_argument, NULL, ARG_SETENV },
440 { "selinux-context", required_argument, NULL, 'Z' },
441 { "selinux-apifs-context", required_argument, NULL, 'L' },
442 { "quiet", no_argument, NULL, 'q' },
443 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
444 { "register", required_argument, NULL, ARG_REGISTER },
445 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
446 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
447 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
448 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
449 { "network-veth", no_argument, NULL, 'n' },
450 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
451 { "personality", required_argument, NULL, ARG_PERSONALITY },
452 { "image", required_argument, NULL, 'i' },
453 { "volatile", optional_argument, NULL, ARG_VOLATILE },
454 { "port", required_argument, NULL, 'p' },
455 { "property", required_argument, NULL, ARG_PROPERTY },
456 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
457 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
458 {}
459 };
460
461 int c, r;
462 uint64_t plus = 0, minus = 0;
463
464 assert(argc >= 0);
465 assert(argv);
466
467 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
468
469 switch (c) {
470
471 case 'h':
472 help();
473 return 0;
474
475 case ARG_VERSION:
476 puts(PACKAGE_STRING);
477 puts(SYSTEMD_FEATURES);
478 return 0;
479
480 case 'D':
481 r = set_sanitized_path(&arg_directory, optarg);
482 if (r < 0)
483 return log_error_errno(r, "Invalid root directory: %m");
484
485 break;
486
487 case ARG_TEMPLATE:
488 r = set_sanitized_path(&arg_template, optarg);
489 if (r < 0)
490 return log_error_errno(r, "Invalid template directory: %m");
491
492 break;
493
494 case 'i':
495 r = set_sanitized_path(&arg_image, optarg);
496 if (r < 0)
497 return log_error_errno(r, "Invalid image path: %m");
498
499 break;
500
501 case 'x':
502 arg_ephemeral = true;
503 break;
504
505 case 'u':
506 r = free_and_strdup(&arg_user, optarg);
507 if (r < 0)
508 return log_oom();
509
510 break;
511
512 case ARG_NETWORK_BRIDGE:
513 arg_network_bridge = optarg;
514
515 /* fall through */
516
517 case 'n':
518 arg_network_veth = true;
519 arg_private_network = true;
520 break;
521
522 case ARG_NETWORK_INTERFACE:
523 if (strv_extend(&arg_network_interfaces, optarg) < 0)
524 return log_oom();
525
526 arg_private_network = true;
527 break;
528
529 case ARG_NETWORK_MACVLAN:
530 if (strv_extend(&arg_network_macvlan, optarg) < 0)
531 return log_oom();
532
533 arg_private_network = true;
534 break;
535
536 case ARG_NETWORK_IPVLAN:
537 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
538 return log_oom();
539
540 /* fall through */
541
542 case ARG_PRIVATE_NETWORK:
543 arg_private_network = true;
544 break;
545
546 case 'b':
547 arg_boot = true;
548 break;
549
550 case ARG_UUID:
551 r = sd_id128_from_string(optarg, &arg_uuid);
552 if (r < 0) {
553 log_error("Invalid UUID: %s", optarg);
554 return r;
555 }
556 break;
557
558 case 'S':
559 arg_slice = optarg;
560 break;
561
562 case 'M':
563 if (isempty(optarg)) {
564 free(arg_machine);
565 arg_machine = NULL;
566 } else {
567 if (!machine_name_is_valid(optarg)) {
568 log_error("Invalid machine name: %s", optarg);
569 return -EINVAL;
570 }
571
572 r = free_and_strdup(&arg_machine, optarg);
573 if (r < 0)
574 return log_oom();
575
576 break;
577 }
578
579 case 'Z':
580 arg_selinux_context = optarg;
581 break;
582
583 case 'L':
584 arg_selinux_apifs_context = optarg;
585 break;
586
587 case ARG_READ_ONLY:
588 arg_read_only = true;
589 break;
590
591 case ARG_CAPABILITY:
592 case ARG_DROP_CAPABILITY: {
593 const char *state, *word;
594 size_t length;
595
596 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
597 _cleanup_free_ char *t;
598
599 t = strndup(word, length);
600 if (!t)
601 return log_oom();
602
603 if (streq(t, "all")) {
604 if (c == ARG_CAPABILITY)
605 plus = (uint64_t) -1;
606 else
607 minus = (uint64_t) -1;
608 } else {
609 int cap;
610
611 cap = capability_from_name(t);
612 if (cap < 0) {
613 log_error("Failed to parse capability %s.", t);
614 return -EINVAL;
615 }
616
617 if (c == ARG_CAPABILITY)
618 plus |= 1ULL << (uint64_t) cap;
619 else
620 minus |= 1ULL << (uint64_t) cap;
621 }
622 }
623
624 break;
625 }
626
627 case 'j':
628 arg_link_journal = LINK_GUEST;
629 arg_link_journal_try = true;
630 break;
631
632 case ARG_LINK_JOURNAL:
633 if (streq(optarg, "auto")) {
634 arg_link_journal = LINK_AUTO;
635 arg_link_journal_try = false;
636 } else if (streq(optarg, "no")) {
637 arg_link_journal = LINK_NO;
638 arg_link_journal_try = false;
639 } else if (streq(optarg, "guest")) {
640 arg_link_journal = LINK_GUEST;
641 arg_link_journal_try = false;
642 } else if (streq(optarg, "host")) {
643 arg_link_journal = LINK_HOST;
644 arg_link_journal_try = false;
645 } else if (streq(optarg, "try-guest")) {
646 arg_link_journal = LINK_GUEST;
647 arg_link_journal_try = true;
648 } else if (streq(optarg, "try-host")) {
649 arg_link_journal = LINK_HOST;
650 arg_link_journal_try = true;
651 } else {
652 log_error("Failed to parse link journal mode %s", optarg);
653 return -EINVAL;
654 }
655
656 break;
657
658 case ARG_BIND:
659 case ARG_BIND_RO: {
660 _cleanup_free_ char *source = NULL, *destination = NULL;
661 CustomMount *m;
662 char *e;
663
664 e = strchr(optarg, ':');
665 if (e) {
666 source = strndup(optarg, e - optarg);
667 destination = strdup(e + 1);
668 } else {
669 source = strdup(optarg);
670 destination = strdup(optarg);
671 }
672
673 if (!source || !destination)
674 return log_oom();
675
676 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
677 log_error("Invalid bind mount specification: %s", optarg);
678 return -EINVAL;
679 }
680
681 m = custom_mount_add(CUSTOM_MOUNT_BIND);
682 if (!m)
683 return log_oom();
684
685 m->source = source;
686 m->destination = destination;
687 m->read_only = c == ARG_BIND_RO;
688
689 source = destination = NULL;
690
691 break;
692 }
693
694 case ARG_TMPFS: {
695 _cleanup_free_ char *path = NULL, *opts = NULL;
696 CustomMount *m;
697 char *e;
698
699 e = strchr(optarg, ':');
700 if (e) {
701 path = strndup(optarg, e - optarg);
702 opts = strdup(e + 1);
703 } else {
704 path = strdup(optarg);
705 opts = strdup("mode=0755");
706 }
707
708 if (!path || !opts)
709 return log_oom();
710
711 if (!path_is_absolute(path)) {
712 log_error("Invalid tmpfs specification: %s", optarg);
713 return -EINVAL;
714 }
715
716 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
717 if (!m)
718 return log_oom();
719
720 m->destination = path;
721 m->options = opts;
722
723 path = opts = NULL;
724
725 break;
726 }
727
728 case ARG_OVERLAY:
729 case ARG_OVERLAY_RO: {
730 _cleanup_free_ char *upper = NULL, *destination = NULL;
731 _cleanup_strv_free_ char **lower = NULL;
732 CustomMount *m;
733 unsigned n = 0;
734 char **i;
735
736 lower = strv_split(optarg, ":");
737 if (!lower)
738 return log_oom();
739
740 STRV_FOREACH(i, lower) {
741 if (!path_is_absolute(*i)) {
742 log_error("Overlay path %s is not absolute.", *i);
743 return -EINVAL;
744 }
745
746 n++;
747 }
748
749 if (n < 2) {
750 log_error("--overlay= needs at least two colon-separated directories specified.");
751 return -EINVAL;
752 }
753
754 if (n == 2) {
755 /* If two parameters are specified,
756 * the first one is the lower, the
757 * second one the upper directory. And
758 * we'll also define the destination
759 * mount point the same as the upper. */
760 upper = lower[1];
761 lower[1] = NULL;
762
763 destination = strdup(upper);
764 if (!destination)
765 return log_oom();
766
767 } else {
768 upper = lower[n - 2];
769 destination = lower[n - 1];
770 lower[n - 2] = NULL;
771 }
772
773 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
774 if (!m)
775 return log_oom();
776
777 m->destination = destination;
778 m->source = upper;
779 m->lower = lower;
780 m->read_only = c == ARG_OVERLAY_RO;
781
782 upper = destination = NULL;
783 lower = NULL;
784
785 break;
786 }
787
788 case ARG_SETENV: {
789 char **n;
790
791 if (!env_assignment_is_valid(optarg)) {
792 log_error("Environment variable assignment '%s' is not valid.", optarg);
793 return -EINVAL;
794 }
795
796 n = strv_env_set(arg_setenv, optarg);
797 if (!n)
798 return log_oom();
799
800 strv_free(arg_setenv);
801 arg_setenv = n;
802 break;
803 }
804
805 case 'q':
806 arg_quiet = true;
807 break;
808
809 case ARG_SHARE_SYSTEM:
810 arg_share_system = true;
811 break;
812
813 case ARG_REGISTER:
814 r = parse_boolean(optarg);
815 if (r < 0) {
816 log_error("Failed to parse --register= argument: %s", optarg);
817 return r;
818 }
819
820 arg_register = r;
821 break;
822
823 case ARG_KEEP_UNIT:
824 arg_keep_unit = true;
825 break;
826
827 case ARG_PERSONALITY:
828
829 arg_personality = personality_from_string(optarg);
830 if (arg_personality == PERSONALITY_INVALID) {
831 log_error("Unknown or unsupported personality '%s'.", optarg);
832 return -EINVAL;
833 }
834
835 break;
836
837 case ARG_VOLATILE:
838
839 if (!optarg)
840 arg_volatile = VOLATILE_YES;
841 else {
842 r = parse_boolean(optarg);
843 if (r < 0) {
844 if (streq(optarg, "state"))
845 arg_volatile = VOLATILE_STATE;
846 else {
847 log_error("Failed to parse --volatile= argument: %s", optarg);
848 return r;
849 }
850 } else
851 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
852 }
853
854 break;
855
856 case 'p': {
857 const char *split, *e;
858 uint16_t container_port, host_port;
859 int protocol;
860 ExposePort *p;
861
862 if ((e = startswith(optarg, "tcp:")))
863 protocol = IPPROTO_TCP;
864 else if ((e = startswith(optarg, "udp:")))
865 protocol = IPPROTO_UDP;
866 else {
867 e = optarg;
868 protocol = IPPROTO_TCP;
869 }
870
871 split = strchr(e, ':');
872 if (split) {
873 char v[split - e + 1];
874
875 memcpy(v, e, split - e);
876 v[split - e] = 0;
877
878 r = safe_atou16(v, &host_port);
879 if (r < 0 || host_port <= 0) {
880 log_error("Failed to parse host port: %s", optarg);
881 return -EINVAL;
882 }
883
884 r = safe_atou16(split + 1, &container_port);
885 } else {
886 r = safe_atou16(e, &container_port);
887 host_port = container_port;
888 }
889
890 if (r < 0 || container_port <= 0) {
891 log_error("Failed to parse host port: %s", optarg);
892 return -EINVAL;
893 }
894
895 LIST_FOREACH(ports, p, arg_expose_ports) {
896 if (p->protocol == protocol && p->host_port == host_port) {
897 log_error("Duplicate port specification: %s", optarg);
898 return -EINVAL;
899 }
900 }
901
902 p = new(ExposePort, 1);
903 if (!p)
904 return log_oom();
905
906 p->protocol = protocol;
907 p->host_port = host_port;
908 p->container_port = container_port;
909
910 LIST_PREPEND(ports, arg_expose_ports, p);
911
912 break;
913 }
914
915 case ARG_PROPERTY:
916 if (strv_extend(&arg_property, optarg) < 0)
917 return log_oom();
918
919 break;
920
921 case ARG_PRIVATE_USERS:
922 if (optarg) {
923 _cleanup_free_ char *buffer = NULL;
924 const char *range, *shift;
925
926 range = strchr(optarg, ':');
927 if (range) {
928 buffer = strndup(optarg, range - optarg);
929 if (!buffer)
930 return log_oom();
931 shift = buffer;
932
933 range++;
934 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
935 log_error("Failed to parse UID range: %s", range);
936 return -EINVAL;
937 }
938 } else
939 shift = optarg;
940
941 if (parse_uid(shift, &arg_uid_shift) < 0) {
942 log_error("Failed to parse UID: %s", optarg);
943 return -EINVAL;
944 }
945 }
946
947 arg_userns = true;
948 break;
949
950 case ARG_KILL_SIGNAL:
951 arg_kill_signal = signal_from_string_try_harder(optarg);
952 if (arg_kill_signal < 0) {
953 log_error("Cannot parse signal: %s", optarg);
954 return -EINVAL;
955 }
956
957 break;
958
959 case '?':
960 return -EINVAL;
961
962 default:
963 assert_not_reached("Unhandled option");
964 }
965
966 if (arg_share_system)
967 arg_register = false;
968
969 if (arg_boot && arg_share_system) {
970 log_error("--boot and --share-system may not be combined.");
971 return -EINVAL;
972 }
973
974 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
975 log_error("--keep-unit may not be used when invoked from a user session.");
976 return -EINVAL;
977 }
978
979 if (arg_directory && arg_image) {
980 log_error("--directory= and --image= may not be combined.");
981 return -EINVAL;
982 }
983
984 if (arg_template && arg_image) {
985 log_error("--template= and --image= may not be combined.");
986 return -EINVAL;
987 }
988
989 if (arg_template && !(arg_directory || arg_machine)) {
990 log_error("--template= needs --directory= or --machine=.");
991 return -EINVAL;
992 }
993
994 if (arg_ephemeral && arg_template) {
995 log_error("--ephemeral and --template= may not be combined.");
996 return -EINVAL;
997 }
998
999 if (arg_ephemeral && arg_image) {
1000 log_error("--ephemeral and --image= may not be combined.");
1001 return -EINVAL;
1002 }
1003
1004 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1005 log_error("--ephemeral and --link-journal= may not be combined.");
1006 return -EINVAL;
1007 }
1008
1009 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1010 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1011 return -EINVAL;
1012 }
1013
1014 if (arg_expose_ports && !arg_private_network) {
1015 log_error("Cannot use --port= without private networking.");
1016 return -EINVAL;
1017 }
1018
1019 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1020 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1021
1022 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1023
1024 if (arg_boot && arg_kill_signal <= 0)
1025 arg_kill_signal = SIGRTMIN+3;
1026
1027 return 1;
1028 }
1029
1030 static int tmpfs_patch_options(const char *options, char **ret) {
1031 char *buf = NULL;
1032
1033 if (arg_userns && arg_uid_shift != 0) {
1034 assert(arg_uid_shift != UID_INVALID);
1035
1036 if (options)
1037 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1038 else
1039 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1040 if (!buf)
1041 return -ENOMEM;
1042
1043 options = buf;
1044 }
1045
1046 #ifdef HAVE_SELINUX
1047 if (arg_selinux_apifs_context) {
1048 char *t;
1049
1050 if (options)
1051 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1052 else
1053 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1054 if (!t) {
1055 free(buf);
1056 return -ENOMEM;
1057 }
1058
1059 free(buf);
1060 buf = t;
1061 }
1062 #endif
1063
1064 *ret = buf;
1065 return !!buf;
1066 }
1067
1068 static int mount_all(const char *dest, bool userns) {
1069
1070 typedef struct MountPoint {
1071 const char *what;
1072 const char *where;
1073 const char *type;
1074 const char *options;
1075 unsigned long flags;
1076 bool fatal;
1077 bool userns;
1078 } MountPoint;
1079
1080 static const MountPoint mount_table[] = {
1081 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1082 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1083 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1084 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1085 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1086 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1087 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1088 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1089 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1090 #ifdef HAVE_SELINUX
1091 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1092 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1093 #endif
1094 };
1095
1096 unsigned k;
1097 int r;
1098
1099 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1100 _cleanup_free_ char *where = NULL, *options = NULL;
1101 const char *o;
1102
1103 if (userns != mount_table[k].userns)
1104 continue;
1105
1106 where = prefix_root(dest, mount_table[k].where);
1107 if (!where)
1108 return log_oom();
1109
1110 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1111 if (r < 0 && r != -ENOENT)
1112 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1113
1114 /* Skip this entry if it is not a remount. */
1115 if (mount_table[k].what && r > 0)
1116 continue;
1117
1118 r = mkdir_p(where, 0755);
1119 if (r < 0) {
1120 if (mount_table[k].fatal)
1121 return log_error_errno(r, "Failed to create directory %s: %m", where);
1122
1123 log_warning_errno(r, "Failed to create directory %s: %m", where);
1124 continue;
1125 }
1126
1127 o = mount_table[k].options;
1128 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1129 r = tmpfs_patch_options(o, &options);
1130 if (r < 0)
1131 return log_oom();
1132 if (r > 0)
1133 o = options;
1134 }
1135
1136 if (mount(mount_table[k].what,
1137 where,
1138 mount_table[k].type,
1139 mount_table[k].flags,
1140 o) < 0) {
1141
1142 if (mount_table[k].fatal)
1143 return log_error_errno(errno, "mount(%s) failed: %m", where);
1144
1145 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1146 }
1147 }
1148
1149 return 0;
1150 }
1151
1152 static int mount_bind(const char *dest, CustomMount *m) {
1153 struct stat source_st, dest_st;
1154 const char *where;
1155 int r;
1156
1157 assert(m);
1158
1159 if (stat(m->source, &source_st) < 0)
1160 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1161
1162 where = prefix_roota(dest, m->destination);
1163
1164 if (stat(where, &dest_st) >= 0) {
1165 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1166 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1167 return -EINVAL;
1168 }
1169
1170 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1171 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1172 return -EINVAL;
1173 }
1174
1175 } else if (errno == ENOENT) {
1176 r = mkdir_parents_label(where, 0755);
1177 if (r < 0)
1178 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1179 } else {
1180 log_error_errno(errno, "Failed to stat %s: %m", where);
1181 return -errno;
1182 }
1183
1184 /* Create the mount point. Any non-directory file can be
1185 * mounted on any non-directory file (regular, fifo, socket,
1186 * char, block).
1187 */
1188 if (S_ISDIR(source_st.st_mode))
1189 r = mkdir_label(where, 0755);
1190 else
1191 r = touch(where);
1192 if (r < 0 && r != -EEXIST)
1193 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1194
1195 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1196 return log_error_errno(errno, "mount(%s) failed: %m", where);
1197
1198 if (m->read_only) {
1199 r = bind_remount_recursive(where, true);
1200 if (r < 0)
1201 return log_error_errno(r, "Read-only bind mount failed: %m");
1202 }
1203
1204 return 0;
1205 }
1206
1207 static int mount_tmpfs(const char *dest, CustomMount *m) {
1208 const char *where, *options;
1209 _cleanup_free_ char *buf = NULL;
1210 int r;
1211
1212 assert(dest);
1213 assert(m);
1214
1215 where = prefix_roota(dest, m->destination);
1216
1217 r = mkdir_p_label(where, 0755);
1218 if (r < 0 && r != -EEXIST)
1219 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1220
1221 r = tmpfs_patch_options(m->options, &buf);
1222 if (r < 0)
1223 return log_oom();
1224 options = r > 0 ? buf : m->options;
1225
1226 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1227 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1228
1229 return 0;
1230 }
1231
1232 static int mount_overlay(const char *dest, CustomMount *m) {
1233 _cleanup_free_ char *lower = NULL;
1234 const char *where, *options;
1235 int r;
1236
1237 assert(dest);
1238 assert(m);
1239
1240 where = prefix_roota(dest, m->destination);
1241
1242 r = mkdir_label(where, 0755);
1243 if (r < 0 && r != -EEXIST)
1244 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1245
1246 (void) mkdir_p_label(m->source, 0755);
1247
1248 strv_reverse(m->lower);
1249 lower = strv_join(m->lower, ":");
1250 strv_reverse(m->lower);
1251 if (!lower)
1252 return log_oom();
1253
1254 if (m->read_only)
1255 options = strjoina("lowerdir=", m->source, ":", lower);
1256 else {
1257 assert(m->work_dir);
1258 (void) mkdir_label(m->work_dir, 0700);
1259
1260 options = strjoina("lowerdir=", lower, ",upperdir=", m->source, ",workdir=", m->work_dir);
1261 }
1262
1263 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1264 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1265
1266 return 0;
1267 }
1268
1269 static int mount_custom(const char *dest) {
1270 unsigned i;
1271 int r;
1272
1273 assert(dest);
1274
1275 for (i = 0; i < arg_n_custom_mounts; i++) {
1276 CustomMount *m = &arg_custom_mounts[i];
1277
1278 switch (m->type) {
1279
1280 case CUSTOM_MOUNT_BIND:
1281 r = mount_bind(dest, m);
1282 break;
1283
1284 case CUSTOM_MOUNT_TMPFS:
1285 r = mount_tmpfs(dest, m);
1286 break;
1287
1288 case CUSTOM_MOUNT_OVERLAY:
1289 r = mount_overlay(dest, m);
1290 break;
1291
1292 default:
1293 assert_not_reached("Unknown custom mount type");
1294 }
1295
1296 if (r < 0)
1297 return r;
1298 }
1299
1300 return 0;
1301 }
1302
1303 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1304 char *to;
1305 int r;
1306
1307 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1308
1309 r = path_is_mount_point(to, 0);
1310 if (r < 0 && r != -ENOENT)
1311 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1312 if (r > 0)
1313 return 0;
1314
1315 mkdir_p(to, 0755);
1316
1317 /* The superblock mount options of the mount point need to be
1318 * identical to the hosts', and hence writable... */
1319 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1320 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1321
1322 /* ... hence let's only make the bind mount read-only, not the
1323 * superblock. */
1324 if (read_only) {
1325 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1326 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1327 }
1328 return 1;
1329 }
1330
1331 static int mount_cgroup(const char *dest) {
1332 _cleanup_set_free_free_ Set *controllers = NULL;
1333 const char *cgroup_root;
1334 int r;
1335
1336 controllers = set_new(&string_hash_ops);
1337 if (!controllers)
1338 return log_oom();
1339
1340 r = cg_kernel_controllers(controllers);
1341 if (r < 0)
1342 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1343
1344 for (;;) {
1345 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1346
1347 controller = set_steal_first(controllers);
1348 if (!controller)
1349 break;
1350
1351 origin = prefix_root("/sys/fs/cgroup/", controller);
1352 if (!origin)
1353 return log_oom();
1354
1355 r = readlink_malloc(origin, &combined);
1356 if (r == -EINVAL) {
1357 /* Not a symbolic link, but directly a single cgroup hierarchy */
1358
1359 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1360 if (r < 0)
1361 return r;
1362
1363 } else if (r < 0)
1364 return log_error_errno(r, "Failed to read link %s: %m", origin);
1365 else {
1366 _cleanup_free_ char *target = NULL;
1367
1368 target = prefix_root(dest, origin);
1369 if (!target)
1370 return log_oom();
1371
1372 /* A symbolic link, a combination of controllers in one hierarchy */
1373
1374 if (!filename_is_valid(combined)) {
1375 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1376 continue;
1377 }
1378
1379 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1380 if (r < 0)
1381 return r;
1382
1383 r = symlink_idempotent(combined, target);
1384 if (r == -EINVAL) {
1385 log_error("Invalid existing symlink for combined hierarchy");
1386 return r;
1387 }
1388 if (r < 0)
1389 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1390 }
1391 }
1392
1393 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1394 if (r < 0)
1395 return r;
1396
1397 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1398 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1399 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1400
1401 return 0;
1402 }
1403
1404 static int mount_systemd_cgroup_writable(const char *dest) {
1405 _cleanup_free_ char *own_cgroup_path = NULL;
1406 const char *systemd_root, *systemd_own;
1407 int r;
1408
1409 assert(dest);
1410
1411 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1412 if (r < 0)
1413 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1414
1415 /* Make our own cgroup a (writable) bind mount */
1416 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1417 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1418 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1419
1420 /* And then remount the systemd cgroup root read-only */
1421 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1422 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1423 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1424
1425 return 0;
1426 }
1427
1428 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1429 assert(p);
1430
1431 if (!arg_userns)
1432 return 0;
1433
1434 if (uid == UID_INVALID && gid == GID_INVALID)
1435 return 0;
1436
1437 if (uid != UID_INVALID) {
1438 uid += arg_uid_shift;
1439
1440 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1441 return -EOVERFLOW;
1442 }
1443
1444 if (gid != GID_INVALID) {
1445 gid += (gid_t) arg_uid_shift;
1446
1447 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1448 return -EOVERFLOW;
1449 }
1450
1451 if (lchown(p, uid, gid) < 0)
1452 return -errno;
1453
1454 return 0;
1455 }
1456
/* Creates the directory "path" below "root" with the given mode and hands
 * it to userns_lchown() for ownership adjustment. An already existing
 * directory is left untouched (returns 0 without chown). Returns -errno if
 * mkdir() fails for any other reason. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1469
1470 static int setup_timezone(const char *dest) {
1471 _cleanup_free_ char *p = NULL, *q = NULL;
1472 const char *where, *check, *what;
1473 char *z, *y;
1474 int r;
1475
1476 assert(dest);
1477
1478 /* Fix the timezone, if possible */
1479 r = readlink_malloc("/etc/localtime", &p);
1480 if (r < 0) {
1481 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1482 return 0;
1483 }
1484
1485 z = path_startswith(p, "../usr/share/zoneinfo/");
1486 if (!z)
1487 z = path_startswith(p, "/usr/share/zoneinfo/");
1488 if (!z) {
1489 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1490 return 0;
1491 }
1492
1493 where = prefix_roota(dest, "/etc/localtime");
1494 r = readlink_malloc(where, &q);
1495 if (r >= 0) {
1496 y = path_startswith(q, "../usr/share/zoneinfo/");
1497 if (!y)
1498 y = path_startswith(q, "/usr/share/zoneinfo/");
1499
1500 /* Already pointing to the right place? Then do nothing .. */
1501 if (y && streq(y, z))
1502 return 0;
1503 }
1504
1505 check = strjoina("/usr/share/zoneinfo/", z);
1506 check = prefix_root(dest, check);
1507 if (laccess(check, F_OK) < 0) {
1508 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1509 return 0;
1510 }
1511
1512 r = unlink(where);
1513 if (r < 0 && errno != ENOENT) {
1514 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1515 return 0;
1516 }
1517
1518 what = strjoina("../usr/share/zoneinfo/", z);
1519 if (symlink(what, where) < 0) {
1520 log_error_errno(errno, "Failed to correct timezone of container: %m");
1521 return 0;
1522 }
1523
1524 r = userns_lchown(where, 0, 0);
1525 if (r < 0)
1526 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1527
1528 return 0;
1529 }
1530
1531 static int setup_resolv_conf(const char *dest) {
1532 const char *where = NULL;
1533 int r;
1534
1535 assert(dest);
1536
1537 if (arg_private_network)
1538 return 0;
1539
1540 /* Fix resolv.conf, if possible */
1541 where = prefix_roota(dest, "/etc/resolv.conf");
1542
1543 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1544 if (r < 0) {
1545 /* If the file already exists as symlink, let's
1546 * suppress the warning, under the assumption that
1547 * resolved or something similar runs inside and the
1548 * symlink points there.
1549 *
1550 * If the disk image is read-only, there's also no
1551 * point in complaining.
1552 */
1553 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1554 "Failed to copy /etc/resolv.conf to %s: %m", where);
1555 return 0;
1556 }
1557
1558 r = userns_lchown(where, 0, 0);
1559 if (r < 0)
1560 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1561
1562 return 0;
1563 }
1564
1565 static int setup_volatile_state(const char *directory) {
1566 _cleanup_free_ char *buf = NULL;
1567 const char *p, *options;
1568 int r;
1569
1570 assert(directory);
1571
1572 if (arg_volatile != VOLATILE_STATE)
1573 return 0;
1574
1575 /* --volatile=state means we simply overmount /var
1576 with a tmpfs, and the rest read-only. */
1577
1578 r = bind_remount_recursive(directory, true);
1579 if (r < 0)
1580 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1581
1582 p = prefix_roota(directory, "/var");
1583 r = mkdir(p, 0755);
1584 if (r < 0 && errno != EEXIST)
1585 return log_error_errno(errno, "Failed to create %s: %m", directory);
1586
1587 options = "mode=755";
1588 r = tmpfs_patch_options(options, &buf);
1589 if (r < 0)
1590 return log_oom();
1591 if (r > 0)
1592 options = buf;
1593
1594 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1595 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1596
1597 return 0;
1598 }
1599
1600 static int setup_volatile(const char *directory) {
1601 bool tmpfs_mounted = false, bind_mounted = false;
1602 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1603 _cleanup_free_ char *buf = NULL;
1604 const char *f, *t, *options;
1605 int r;
1606
1607 assert(directory);
1608
1609 if (arg_volatile != VOLATILE_YES)
1610 return 0;
1611
1612 /* --volatile=yes means we mount a tmpfs to the root dir, and
1613 the original /usr to use inside it, and that read-only. */
1614
1615 if (!mkdtemp(template))
1616 return log_error_errno(errno, "Failed to create temporary directory: %m");
1617
1618 options = "mode=755";
1619 r = tmpfs_patch_options(options, &buf);
1620 if (r < 0)
1621 return log_oom();
1622 if (r > 0)
1623 options = buf;
1624
1625 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1626 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1627 goto fail;
1628 }
1629
1630 tmpfs_mounted = true;
1631
1632 f = prefix_roota(directory, "/usr");
1633 t = prefix_roota(template, "/usr");
1634
1635 r = mkdir(t, 0755);
1636 if (r < 0 && errno != EEXIST) {
1637 r = log_error_errno(errno, "Failed to create %s: %m", t);
1638 goto fail;
1639 }
1640
1641 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1642 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1643 goto fail;
1644 }
1645
1646 bind_mounted = true;
1647
1648 r = bind_remount_recursive(t, true);
1649 if (r < 0) {
1650 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1651 goto fail;
1652 }
1653
1654 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1655 r = log_error_errno(errno, "Failed to move root mount: %m");
1656 goto fail;
1657 }
1658
1659 (void) rmdir(template);
1660
1661 return 0;
1662
1663 fail:
1664 if (bind_mounted)
1665 (void) umount(t);
1666
1667 if (tmpfs_mounted)
1668 (void) umount(template);
1669 (void) rmdir(template);
1670 return r;
1671 }
1672
1673 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1674 assert(s);
1675
1676 snprintf(s, 37,
1677 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1678 SD_ID128_FORMAT_VAL(id));
1679
1680 return s;
1681 }
1682
1683 static int setup_boot_id(const char *dest) {
1684 const char *from, *to;
1685 sd_id128_t rnd = {};
1686 char as_uuid[37];
1687 int r;
1688
1689 if (arg_share_system)
1690 return 0;
1691
1692 /* Generate a new randomized boot ID, so that each boot-up of
1693 * the container gets a new one */
1694
1695 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1696 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1697
1698 r = sd_id128_randomize(&rnd);
1699 if (r < 0)
1700 return log_error_errno(r, "Failed to generate random boot id: %m");
1701
1702 id128_format_as_uuid(rnd, as_uuid);
1703
1704 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1705 if (r < 0)
1706 return log_error_errno(r, "Failed to write boot id: %m");
1707
1708 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1709 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1710 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1711 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1712
1713 unlink(from);
1714 return r;
1715 }
1716
1717 static int copy_devnodes(const char *dest) {
1718
1719 static const char devnodes[] =
1720 "null\0"
1721 "zero\0"
1722 "full\0"
1723 "random\0"
1724 "urandom\0"
1725 "tty\0"
1726 "net/tun\0";
1727
1728 const char *d;
1729 int r = 0;
1730 _cleanup_umask_ mode_t u;
1731
1732 assert(dest);
1733
1734 u = umask(0000);
1735
1736 /* Create /dev/net, so that we can create /dev/net/tun in it */
1737 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1738 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1739
1740 NULSTR_FOREACH(d, devnodes) {
1741 _cleanup_free_ char *from = NULL, *to = NULL;
1742 struct stat st;
1743
1744 from = strappend("/dev/", d);
1745 to = prefix_root(dest, from);
1746
1747 if (stat(from, &st) < 0) {
1748
1749 if (errno != ENOENT)
1750 return log_error_errno(errno, "Failed to stat %s: %m", from);
1751
1752 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1753
1754 log_error("%s is not a char or block device, cannot copy.", from);
1755 return -EIO;
1756
1757 } else {
1758 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1759 if (errno != EPERM)
1760 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1761
1762 /* Some systems abusively restrict mknod but
1763 * allow bind mounts. */
1764 r = touch(to);
1765 if (r < 0)
1766 return log_error_errno(r, "touch (%s) failed: %m", to);
1767 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1768 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1769 }
1770
1771 r = userns_lchown(to, 0, 0);
1772 if (r < 0)
1773 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1774 }
1775 }
1776
1777 return r;
1778 }
1779
1780 static int setup_pts(const char *dest) {
1781 _cleanup_free_ char *options = NULL;
1782 const char *p;
1783
1784 #ifdef HAVE_SELINUX
1785 if (arg_selinux_apifs_context)
1786 (void) asprintf(&options,
1787 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1788 arg_uid_shift + TTY_GID,
1789 arg_selinux_apifs_context);
1790 else
1791 #endif
1792 (void) asprintf(&options,
1793 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1794 arg_uid_shift + TTY_GID);
1795
1796 if (!options)
1797 return log_oom();
1798
1799 /* Mount /dev/pts itself */
1800 p = prefix_roota(dest, "/dev/pts");
1801 if (mkdir(p, 0755) < 0)
1802 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1803 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1804 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1805 if (userns_lchown(p, 0, 0) < 0)
1806 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1807
1808 /* Create /dev/ptmx symlink */
1809 p = prefix_roota(dest, "/dev/ptmx");
1810 if (symlink("pts/ptmx", p) < 0)
1811 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1812 if (userns_lchown(p, 0, 0) < 0)
1813 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1814
1815 /* And fix /dev/pts/ptmx ownership */
1816 p = prefix_roota(dest, "/dev/pts/ptmx");
1817 if (userns_lchown(p, 0, 0) < 0)
1818 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1819
1820 return 0;
1821 }
1822
1823 static int setup_dev_console(const char *dest, const char *console) {
1824 _cleanup_umask_ mode_t u;
1825 const char *to;
1826 int r;
1827
1828 assert(dest);
1829 assert(console);
1830
1831 u = umask(0000);
1832
1833 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1834 if (r < 0)
1835 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1836
1837 /* We need to bind mount the right tty to /dev/console since
1838 * ptys can only exist on pts file systems. To have something
1839 * to bind mount things on we create a empty regular file. */
1840
1841 to = prefix_roota(dest, "/dev/console");
1842 r = touch(to);
1843 if (r < 0)
1844 return log_error_errno(r, "touch() for /dev/console failed: %m");
1845
1846 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1847 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1848
1849 return 0;
1850 }
1851
1852 static int setup_kmsg(const char *dest, int kmsg_socket) {
1853 const char *from, *to;
1854 _cleanup_umask_ mode_t u;
1855 int fd, k;
1856 union {
1857 struct cmsghdr cmsghdr;
1858 uint8_t buf[CMSG_SPACE(sizeof(int))];
1859 } control = {};
1860 struct msghdr mh = {
1861 .msg_control = &control,
1862 .msg_controllen = sizeof(control),
1863 };
1864 struct cmsghdr *cmsg;
1865
1866 assert(kmsg_socket >= 0);
1867
1868 u = umask(0000);
1869
1870 /* We create the kmsg FIFO as /run/kmsg, but immediately
1871 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1872 * on the reading side behave very similar to /proc/kmsg,
1873 * their writing side behaves differently from /dev/kmsg in
1874 * that writing blocks when nothing is reading. In order to
1875 * avoid any problems with containers deadlocking due to this
1876 * we simply make /dev/kmsg unavailable to the container. */
1877 from = prefix_roota(dest, "/run/kmsg");
1878 to = prefix_roota(dest, "/proc/kmsg");
1879
1880 if (mkfifo(from, 0600) < 0)
1881 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1882 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1883 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1884
1885 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1886 if (fd < 0)
1887 return log_error_errno(errno, "Failed to open fifo: %m");
1888
1889 cmsg = CMSG_FIRSTHDR(&mh);
1890 cmsg->cmsg_level = SOL_SOCKET;
1891 cmsg->cmsg_type = SCM_RIGHTS;
1892 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1893 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1894
1895 mh.msg_controllen = cmsg->cmsg_len;
1896
1897 /* Store away the fd in the socket, so that it stays open as
1898 * long as we run the child */
1899 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1900 safe_close(fd);
1901
1902 if (k < 0)
1903 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1904
1905 /* And now make the FIFO unavailable as /run/kmsg... */
1906 (void) unlink(from);
1907
1908 return 0;
1909 }
1910
1911 static int send_rtnl(int send_fd) {
1912 union {
1913 struct cmsghdr cmsghdr;
1914 uint8_t buf[CMSG_SPACE(sizeof(int))];
1915 } control = {};
1916 struct msghdr mh = {
1917 .msg_control = &control,
1918 .msg_controllen = sizeof(control),
1919 };
1920 struct cmsghdr *cmsg;
1921 _cleanup_close_ int fd = -1;
1922 ssize_t k;
1923
1924 assert(send_fd >= 0);
1925
1926 if (!arg_expose_ports)
1927 return 0;
1928
1929 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1930 if (fd < 0)
1931 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1932
1933 cmsg = CMSG_FIRSTHDR(&mh);
1934 cmsg->cmsg_level = SOL_SOCKET;
1935 cmsg->cmsg_type = SCM_RIGHTS;
1936 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1937 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1938
1939 mh.msg_controllen = cmsg->cmsg_len;
1940
1941 /* Store away the fd in the socket, so that it stays open as
1942 * long as we run the child */
1943 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1944 if (k < 0)
1945 return log_error_errno(errno, "Failed to send netlink fd: %m");
1946
1947 return 0;
1948 }
1949
1950 static int flush_ports(union in_addr_union *exposed) {
1951 ExposePort *p;
1952 int r, af = AF_INET;
1953
1954 assert(exposed);
1955
1956 if (!arg_expose_ports)
1957 return 0;
1958
1959 if (in_addr_is_null(af, exposed))
1960 return 0;
1961
1962 log_debug("Lost IP address.");
1963
1964 LIST_FOREACH(ports, p, arg_expose_ports) {
1965 r = fw_add_local_dnat(false,
1966 af,
1967 p->protocol,
1968 NULL,
1969 NULL, 0,
1970 NULL, 0,
1971 p->host_port,
1972 exposed,
1973 p->container_port,
1974 NULL);
1975 if (r < 0)
1976 log_warning_errno(r, "Failed to modify firewall: %m");
1977 }
1978
1979 *exposed = IN_ADDR_NULL;
1980 return 0;
1981 }
1982
1983 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
1984 _cleanup_free_ struct local_address *addresses = NULL;
1985 _cleanup_free_ char *pretty = NULL;
1986 union in_addr_union new_exposed;
1987 ExposePort *p;
1988 bool add;
1989 int af = AF_INET, r;
1990
1991 assert(exposed);
1992
1993 /* Invoked each time an address is added or removed inside the
1994 * container */
1995
1996 if (!arg_expose_ports)
1997 return 0;
1998
1999 r = local_addresses(rtnl, 0, af, &addresses);
2000 if (r < 0)
2001 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2002
2003 add = r > 0 &&
2004 addresses[0].family == af &&
2005 addresses[0].scope < RT_SCOPE_LINK;
2006
2007 if (!add)
2008 return flush_ports(exposed);
2009
2010 new_exposed = addresses[0].address;
2011 if (in_addr_equal(af, exposed, &new_exposed))
2012 return 0;
2013
2014 in_addr_to_string(af, &new_exposed, &pretty);
2015 log_debug("New container IP is %s.", strna(pretty));
2016
2017 LIST_FOREACH(ports, p, arg_expose_ports) {
2018
2019 r = fw_add_local_dnat(true,
2020 af,
2021 p->protocol,
2022 NULL,
2023 NULL, 0,
2024 NULL, 0,
2025 p->host_port,
2026 &new_exposed,
2027 p->container_port,
2028 in_addr_is_null(af, exposed) ? NULL : exposed);
2029 if (r < 0)
2030 log_warning_errno(r, "Failed to modify firewall: %m");
2031 }
2032
2033 *exposed = new_exposed;
2034 return 0;
2035 }
2036
2037 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2038 union in_addr_union *exposed = userdata;
2039
2040 assert(rtnl);
2041 assert(m);
2042 assert(exposed);
2043
2044 expose_ports(rtnl, exposed);
2045 return 0;
2046 }
2047
2048 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2049 union {
2050 struct cmsghdr cmsghdr;
2051 uint8_t buf[CMSG_SPACE(sizeof(int))];
2052 } control = {};
2053 struct msghdr mh = {
2054 .msg_control = &control,
2055 .msg_controllen = sizeof(control),
2056 };
2057 struct cmsghdr *cmsg;
2058 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2059 int fd, r;
2060 ssize_t k;
2061
2062 assert(event);
2063 assert(recv_fd >= 0);
2064 assert(ret);
2065
2066 if (!arg_expose_ports)
2067 return 0;
2068
2069 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2070 if (k < 0)
2071 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2072
2073 cmsg = CMSG_FIRSTHDR(&mh);
2074 assert(cmsg->cmsg_level == SOL_SOCKET);
2075 assert(cmsg->cmsg_type == SCM_RIGHTS);
2076 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2077 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2078
2079 r = sd_netlink_open_fd(&rtnl, fd);
2080 if (r < 0) {
2081 safe_close(fd);
2082 return log_error_errno(r, "Failed to create rtnl object: %m");
2083 }
2084
2085 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2086 if (r < 0)
2087 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2088
2089 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2090 if (r < 0)
2091 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2092
2093 r = sd_netlink_attach_event(rtnl, event, 0);
2094 if (r < 0)
2095 return log_error_errno(r, "Failed to add to even loop: %m");
2096
2097 *ret = rtnl;
2098 rtnl = NULL;
2099
2100 return 0;
2101 }
2102
2103 static int setup_hostname(void) {
2104
2105 if (arg_share_system)
2106 return 0;
2107
2108 if (sethostname_idempotent(arg_machine) < 0)
2109 return -errno;
2110
2111 return 0;
2112 }
2113
2114 static int setup_journal(const char *directory) {
2115 sd_id128_t machine_id, this_id;
2116 _cleanup_free_ char *b = NULL, *d = NULL;
2117 const char *etc_machine_id, *p, *q;
2118 char *id;
2119 int r;
2120
2121 /* Don't link journals in ephemeral mode */
2122 if (arg_ephemeral)
2123 return 0;
2124
2125 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2126
2127 r = read_one_line_file(etc_machine_id, &b);
2128 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2129 return 0;
2130 else if (r < 0)
2131 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2132
2133 id = strstrip(b);
2134 if (isempty(id) && arg_link_journal == LINK_AUTO)
2135 return 0;
2136
2137 /* Verify validity */
2138 r = sd_id128_from_string(id, &machine_id);
2139 if (r < 0)
2140 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2141
2142 r = sd_id128_get_machine(&this_id);
2143 if (r < 0)
2144 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2145
2146 if (sd_id128_equal(machine_id, this_id)) {
2147 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2148 "Host and machine ids are equal (%s): refusing to link journals", id);
2149 if (arg_link_journal == LINK_AUTO)
2150 return 0;
2151 return -EEXIST;
2152 }
2153
2154 if (arg_link_journal == LINK_NO)
2155 return 0;
2156
2157 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2158 if (r < 0)
2159 return log_error_errno(r, "Failed to create /var: %m");
2160
2161 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2162 if (r < 0)
2163 return log_error_errno(r, "Failed to create /var/log: %m");
2164
2165 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2166 if (r < 0)
2167 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2168
2169 p = strjoina("/var/log/journal/", id);
2170 q = prefix_roota(directory, p);
2171
2172 if (path_is_mount_point(p, 0) > 0) {
2173 if (arg_link_journal != LINK_AUTO) {
2174 log_error("%s: already a mount point, refusing to use for journal", p);
2175 return -EEXIST;
2176 }
2177
2178 return 0;
2179 }
2180
2181 if (path_is_mount_point(q, 0) > 0) {
2182 if (arg_link_journal != LINK_AUTO) {
2183 log_error("%s: already a mount point, refusing to use for journal", q);
2184 return -EEXIST;
2185 }
2186
2187 return 0;
2188 }
2189
2190 r = readlink_and_make_absolute(p, &d);
2191 if (r >= 0) {
2192 if ((arg_link_journal == LINK_GUEST ||
2193 arg_link_journal == LINK_AUTO) &&
2194 path_equal(d, q)) {
2195
2196 r = userns_mkdir(directory, p, 0755, 0, 0);
2197 if (r < 0)
2198 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2199 return 0;
2200 }
2201
2202 if (unlink(p) < 0)
2203 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2204 } else if (r == -EINVAL) {
2205
2206 if (arg_link_journal == LINK_GUEST &&
2207 rmdir(p) < 0) {
2208
2209 if (errno == ENOTDIR) {
2210 log_error("%s already exists and is neither a symlink nor a directory", p);
2211 return r;
2212 } else {
2213 log_error_errno(errno, "Failed to remove %s: %m", p);
2214 return -errno;
2215 }
2216 }
2217 } else if (r != -ENOENT) {
2218 log_error_errno(errno, "readlink(%s) failed: %m", p);
2219 return r;
2220 }
2221
2222 if (arg_link_journal == LINK_GUEST) {
2223
2224 if (symlink(q, p) < 0) {
2225 if (arg_link_journal_try) {
2226 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2227 return 0;
2228 } else {
2229 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2230 return -errno;
2231 }
2232 }
2233
2234 r = userns_mkdir(directory, p, 0755, 0, 0);
2235 if (r < 0)
2236 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2237 return 0;
2238 }
2239
2240 if (arg_link_journal == LINK_HOST) {
2241 /* don't create parents here -- if the host doesn't have
2242 * permanent journal set up, don't force it here */
2243 r = mkdir(p, 0755);
2244 if (r < 0) {
2245 if (arg_link_journal_try) {
2246 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2247 return 0;
2248 } else {
2249 log_error_errno(errno, "Failed to create %s: %m", p);
2250 return r;
2251 }
2252 }
2253
2254 } else if (access(p, F_OK) < 0)
2255 return 0;
2256
2257 if (dir_is_empty(q) == 0)
2258 log_warning("%s is not empty, proceeding anyway.", q);
2259
2260 r = userns_mkdir(directory, p, 0755, 0, 0);
2261 if (r < 0) {
2262 log_error_errno(errno, "Failed to create %s: %m", q);
2263 return r;
2264 }
2265
2266 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2267 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2268
2269 return 0;
2270 }
2271
2272 static int drop_capabilities(void) {
2273 return capability_bounding_set_drop(~arg_retain, false);
2274 }
2275
2276 static int register_machine(pid_t pid, int local_ifindex) {
2277 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2278 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2279 int r;
2280
2281 if (!arg_register)
2282 return 0;
2283
2284 r = sd_bus_default_system(&bus);
2285 if (r < 0)
2286 return log_error_errno(r, "Failed to open system bus: %m");
2287
2288 if (arg_keep_unit) {
2289 r = sd_bus_call_method(
2290 bus,
2291 "org.freedesktop.machine1",
2292 "/org/freedesktop/machine1",
2293 "org.freedesktop.machine1.Manager",
2294 "RegisterMachineWithNetwork",
2295 &error,
2296 NULL,
2297 "sayssusai",
2298 arg_machine,
2299 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2300 "nspawn",
2301 "container",
2302 (uint32_t) pid,
2303 strempty(arg_directory),
2304 local_ifindex > 0 ? 1 : 0, local_ifindex);
2305 } else {
2306 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2307 char **i;
2308 unsigned j;
2309
2310 r = sd_bus_message_new_method_call(
2311 bus,
2312 &m,
2313 "org.freedesktop.machine1",
2314 "/org/freedesktop/machine1",
2315 "org.freedesktop.machine1.Manager",
2316 "CreateMachineWithNetwork");
2317 if (r < 0)
2318 return bus_log_create_error(r);
2319
2320 r = sd_bus_message_append(
2321 m,
2322 "sayssusai",
2323 arg_machine,
2324 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2325 "nspawn",
2326 "container",
2327 (uint32_t) pid,
2328 strempty(arg_directory),
2329 local_ifindex > 0 ? 1 : 0, local_ifindex);
2330 if (r < 0)
2331 return bus_log_create_error(r);
2332
2333 r = sd_bus_message_open_container(m, 'a', "(sv)");
2334 if (r < 0)
2335 return bus_log_create_error(r);
2336
2337 if (!isempty(arg_slice)) {
2338 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2339 if (r < 0)
2340 return bus_log_create_error(r);
2341 }
2342
2343 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2344 if (r < 0)
2345 return bus_log_create_error(r);
2346
2347 /* If you make changes here, also make sure to update
2348 * systemd-nspawn@.service, to keep the device
2349 * policies in sync regardless if we are run with or
2350 * without the --keep-unit switch. */
2351 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2352 /* Allow the container to
2353 * access and create the API
2354 * device nodes, so that
2355 * PrivateDevices= in the
2356 * container can work
2357 * fine */
2358 "/dev/null", "rwm",
2359 "/dev/zero", "rwm",
2360 "/dev/full", "rwm",
2361 "/dev/random", "rwm",
2362 "/dev/urandom", "rwm",
2363 "/dev/tty", "rwm",
2364 "/dev/net/tun", "rwm",
2365 /* Allow the container
2366 * access to ptys. However,
2367 * do not permit the
2368 * container to ever create
2369 * these device nodes. */
2370 "/dev/pts/ptmx", "rw",
2371 "char-pts", "rw");
2372 if (r < 0)
2373 return bus_log_create_error(r);
2374
2375 for (j = 0; j < arg_n_custom_mounts; j++) {
2376 CustomMount *cm = &arg_custom_mounts[j];
2377
2378 if (cm->type != CUSTOM_MOUNT_BIND)
2379 continue;
2380
2381 r = is_device_node(cm->source);
2382 if (r < 0)
2383 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2384
2385 if (r) {
2386 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2387 cm->source, cm->read_only ? "r" : "rw");
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to append message arguments: %m");
2390 }
2391 }
2392
2393 if (arg_kill_signal != 0) {
2394 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2395 if (r < 0)
2396 return bus_log_create_error(r);
2397
2398 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2399 if (r < 0)
2400 return bus_log_create_error(r);
2401 }
2402
2403 STRV_FOREACH(i, arg_property) {
2404 r = sd_bus_message_open_container(m, 'r', "sv");
2405 if (r < 0)
2406 return bus_log_create_error(r);
2407
2408 r = bus_append_unit_property_assignment(m, *i);
2409 if (r < 0)
2410 return r;
2411
2412 r = sd_bus_message_close_container(m);
2413 if (r < 0)
2414 return bus_log_create_error(r);
2415 }
2416
2417 r = sd_bus_message_close_container(m);
2418 if (r < 0)
2419 return bus_log_create_error(r);
2420
2421 r = sd_bus_call(bus, m, 0, &error, NULL);
2422 }
2423
2424 if (r < 0) {
2425 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2426 return r;
2427 }
2428
2429 return 0;
2430 }
2431
2432 static int terminate_machine(pid_t pid) {
2433 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2434 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2435 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2436 const char *path;
2437 int r;
2438
2439 if (!arg_register)
2440 return 0;
2441
2442 /* If we are reusing the unit, then just exit, systemd will do
2443 * the right thing when we exit. */
2444 if (arg_keep_unit)
2445 return 0;
2446
2447 r = sd_bus_default_system(&bus);
2448 if (r < 0)
2449 return log_error_errno(r, "Failed to open system bus: %m");
2450
2451 r = sd_bus_call_method(
2452 bus,
2453 "org.freedesktop.machine1",
2454 "/org/freedesktop/machine1",
2455 "org.freedesktop.machine1.Manager",
2456 "GetMachineByPID",
2457 &error,
2458 &reply,
2459 "u",
2460 (uint32_t) pid);
2461 if (r < 0) {
2462 /* Note that the machine might already have been
2463 * cleaned up automatically, hence don't consider it a
2464 * failure if we cannot get the machine object. */
2465 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2466 return 0;
2467 }
2468
2469 r = sd_bus_message_read(reply, "o", &path);
2470 if (r < 0)
2471 return bus_log_parse_error(r);
2472
2473 r = sd_bus_call_method(
2474 bus,
2475 "org.freedesktop.machine1",
2476 path,
2477 "org.freedesktop.machine1.Machine",
2478 "Terminate",
2479 &error,
2480 NULL,
2481 NULL);
2482 if (r < 0) {
2483 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2484 return 0;
2485 }
2486
2487 return 0;
2488 }
2489
2490 static int reset_audit_loginuid(void) {
2491 _cleanup_free_ char *p = NULL;
2492 int r;
2493
2494 if (arg_share_system)
2495 return 0;
2496
2497 r = read_one_line_file("/proc/self/loginuid", &p);
2498 if (r == -ENOENT)
2499 return 0;
2500 if (r < 0)
2501 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2502
2503 /* Already reset? */
2504 if (streq(p, "4294967295"))
2505 return 0;
2506
2507 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2508 if (r < 0) {
2509 log_error_errno(r,
2510 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2511 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2512 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2513 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2514 "using systemd-nspawn. Sleeping for 5s... (%m)");
2515
2516 sleep(5);
2517 }
2518
2519 return 0;
2520 }
2521
/* Fixed keys handed to generate_mac(): one for the host side of the veth
 * link, one for the container side, and one for macvlan interfaces. */
#define HOST_HASH_KEY      SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY   SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2525
2526 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2527 uint8_t result[8];
2528 size_t l, sz;
2529 uint8_t *v, *i;
2530 int r;
2531
2532 l = strlen(arg_machine);
2533 sz = sizeof(sd_id128_t) + l;
2534 if (idx > 0)
2535 sz += sizeof(idx);
2536
2537 v = alloca(sz);
2538
2539 /* fetch some persistent data unique to the host */
2540 r = sd_id128_get_machine((sd_id128_t*) v);
2541 if (r < 0)
2542 return r;
2543
2544 /* combine with some data unique (on this host) to this
2545 * container instance */
2546 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2547 if (idx > 0) {
2548 idx = htole64(idx);
2549 memcpy(i, &idx, sizeof(idx));
2550 }
2551
2552 /* Let's hash the host machine ID plus the container name. We
2553 * use a fixed, but originally randomly created hash key here. */
2554 siphash24(result, v, sz, hash_key.bytes);
2555
2556 assert_cc(ETH_ALEN <= sizeof(result));
2557 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2558
2559 /* see eth_random_addr in the kernel */
2560 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2561 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2562
2563 return 0;
2564 }
2565
2566 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2567 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2568 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2569 struct ether_addr mac_host, mac_container;
2570 int r, i;
2571
2572 if (!arg_private_network)
2573 return 0;
2574
2575 if (!arg_network_veth)
2576 return 0;
2577
2578 /* Use two different interface name prefixes depending whether
2579 * we are in bridge mode or not. */
2580 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2581 arg_network_bridge ? "vb" : "ve", arg_machine);
2582
2583 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2584 if (r < 0)
2585 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2586
2587 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2588 if (r < 0)
2589 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2590
2591 r = sd_netlink_open(&rtnl);
2592 if (r < 0)
2593 return log_error_errno(r, "Failed to connect to netlink: %m");
2594
2595 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2596 if (r < 0)
2597 return log_error_errno(r, "Failed to allocate netlink message: %m");
2598
2599 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2600 if (r < 0)
2601 return log_error_errno(r, "Failed to add netlink interface name: %m");
2602
2603 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2604 if (r < 0)
2605 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2606
2607 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2608 if (r < 0)
2609 return log_error_errno(r, "Failed to open netlink container: %m");
2610
2611 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2612 if (r < 0)
2613 return log_error_errno(r, "Failed to open netlink container: %m");
2614
2615 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2616 if (r < 0)
2617 return log_error_errno(r, "Failed to open netlink container: %m");
2618
2619 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2620 if (r < 0)
2621 return log_error_errno(r, "Failed to add netlink interface name: %m");
2622
2623 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2624 if (r < 0)
2625 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2626
2627 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2628 if (r < 0)
2629 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2630
2631 r = sd_netlink_message_close_container(m);
2632 if (r < 0)
2633 return log_error_errno(r, "Failed to close netlink container: %m");
2634
2635 r = sd_netlink_message_close_container(m);
2636 if (r < 0)
2637 return log_error_errno(r, "Failed to close netlink container: %m");
2638
2639 r = sd_netlink_message_close_container(m);
2640 if (r < 0)
2641 return log_error_errno(r, "Failed to close netlink container: %m");
2642
2643 r = sd_netlink_call(rtnl, m, 0, NULL);
2644 if (r < 0)
2645 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2646
2647 i = (int) if_nametoindex(iface_name);
2648 if (i <= 0)
2649 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2650
2651 *ifi = i;
2652
2653 return 0;
2654 }
2655
2656 static int setup_bridge(const char veth_name[], int *ifi) {
2657 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2658 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2659 int r, bridge;
2660
2661 if (!arg_private_network)
2662 return 0;
2663
2664 if (!arg_network_veth)
2665 return 0;
2666
2667 if (!arg_network_bridge)
2668 return 0;
2669
2670 bridge = (int) if_nametoindex(arg_network_bridge);
2671 if (bridge <= 0)
2672 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2673
2674 *ifi = bridge;
2675
2676 r = sd_netlink_open(&rtnl);
2677 if (r < 0)
2678 return log_error_errno(r, "Failed to connect to netlink: %m");
2679
2680 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to allocate netlink message: %m");
2683
2684 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2685 if (r < 0)
2686 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2687
2688 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2689 if (r < 0)
2690 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2691
2692 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2693 if (r < 0)
2694 return log_error_errno(r, "Failed to add netlink master field: %m");
2695
2696 r = sd_netlink_call(rtnl, m, 0, NULL);
2697 if (r < 0)
2698 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2699
2700 return 0;
2701 }
2702
2703 static int parse_interface(struct udev *udev, const char *name) {
2704 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2705 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2706 int ifi;
2707
2708 ifi = (int) if_nametoindex(name);
2709 if (ifi <= 0)
2710 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2711
2712 sprintf(ifi_str, "n%i", ifi);
2713 d = udev_device_new_from_device_id(udev, ifi_str);
2714 if (!d)
2715 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2716
2717 if (udev_device_get_is_initialized(d) <= 0) {
2718 log_error("Network interface %s is not initialized yet.", name);
2719 return -EBUSY;
2720 }
2721
2722 return ifi;
2723 }
2724
2725 static int move_network_interfaces(pid_t pid) {
2726 _cleanup_udev_unref_ struct udev *udev = NULL;
2727 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2728 char **i;
2729 int r;
2730
2731 if (!arg_private_network)
2732 return 0;
2733
2734 if (strv_isempty(arg_network_interfaces))
2735 return 0;
2736
2737 r = sd_netlink_open(&rtnl);
2738 if (r < 0)
2739 return log_error_errno(r, "Failed to connect to netlink: %m");
2740
2741 udev = udev_new();
2742 if (!udev) {
2743 log_error("Failed to connect to udev.");
2744 return -ENOMEM;
2745 }
2746
2747 STRV_FOREACH(i, arg_network_interfaces) {
2748 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2749 int ifi;
2750
2751 ifi = parse_interface(udev, *i);
2752 if (ifi < 0)
2753 return ifi;
2754
2755 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2756 if (r < 0)
2757 return log_error_errno(r, "Failed to allocate netlink message: %m");
2758
2759 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2760 if (r < 0)
2761 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2762
2763 r = sd_netlink_call(rtnl, m, 0, NULL);
2764 if (r < 0)
2765 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2766 }
2767
2768 return 0;
2769 }
2770
2771 static int setup_macvlan(pid_t pid) {
2772 _cleanup_udev_unref_ struct udev *udev = NULL;
2773 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2774 unsigned idx = 0;
2775 char **i;
2776 int r;
2777
2778 if (!arg_private_network)
2779 return 0;
2780
2781 if (strv_isempty(arg_network_macvlan))
2782 return 0;
2783
2784 r = sd_netlink_open(&rtnl);
2785 if (r < 0)
2786 return log_error_errno(r, "Failed to connect to netlink: %m");
2787
2788 udev = udev_new();
2789 if (!udev) {
2790 log_error("Failed to connect to udev.");
2791 return -ENOMEM;
2792 }
2793
2794 STRV_FOREACH(i, arg_network_macvlan) {
2795 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2796 _cleanup_free_ char *n = NULL;
2797 struct ether_addr mac;
2798 int ifi;
2799
2800 ifi = parse_interface(udev, *i);
2801 if (ifi < 0)
2802 return ifi;
2803
2804 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2805 if (r < 0)
2806 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2807
2808 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2809 if (r < 0)
2810 return log_error_errno(r, "Failed to allocate netlink message: %m");
2811
2812 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2813 if (r < 0)
2814 return log_error_errno(r, "Failed to add netlink interface index: %m");
2815
2816 n = strappend("mv-", *i);
2817 if (!n)
2818 return log_oom();
2819
2820 strshorten(n, IFNAMSIZ-1);
2821
2822 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2823 if (r < 0)
2824 return log_error_errno(r, "Failed to add netlink interface name: %m");
2825
2826 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2827 if (r < 0)
2828 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2829
2830 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2831 if (r < 0)
2832 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2833
2834 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2835 if (r < 0)
2836 return log_error_errno(r, "Failed to open netlink container: %m");
2837
2838 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2839 if (r < 0)
2840 return log_error_errno(r, "Failed to open netlink container: %m");
2841
2842 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2843 if (r < 0)
2844 return log_error_errno(r, "Failed to append macvlan mode: %m");
2845
2846 r = sd_netlink_message_close_container(m);
2847 if (r < 0)
2848 return log_error_errno(r, "Failed to close netlink container: %m");
2849
2850 r = sd_netlink_message_close_container(m);
2851 if (r < 0)
2852 return log_error_errno(r, "Failed to close netlink container: %m");
2853
2854 r = sd_netlink_call(rtnl, m, 0, NULL);
2855 if (r < 0)
2856 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2857 }
2858
2859 return 0;
2860 }
2861
2862 static int setup_ipvlan(pid_t pid) {
2863 _cleanup_udev_unref_ struct udev *udev = NULL;
2864 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2865 char **i;
2866 int r;
2867
2868 if (!arg_private_network)
2869 return 0;
2870
2871 if (strv_isempty(arg_network_ipvlan))
2872 return 0;
2873
2874 r = sd_netlink_open(&rtnl);
2875 if (r < 0)
2876 return log_error_errno(r, "Failed to connect to netlink: %m");
2877
2878 udev = udev_new();
2879 if (!udev) {
2880 log_error("Failed to connect to udev.");
2881 return -ENOMEM;
2882 }
2883
2884 STRV_FOREACH(i, arg_network_ipvlan) {
2885 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2886 _cleanup_free_ char *n = NULL;
2887 int ifi;
2888
2889 ifi = parse_interface(udev, *i);
2890 if (ifi < 0)
2891 return ifi;
2892
2893 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2894 if (r < 0)
2895 return log_error_errno(r, "Failed to allocate netlink message: %m");
2896
2897 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2898 if (r < 0)
2899 return log_error_errno(r, "Failed to add netlink interface index: %m");
2900
2901 n = strappend("iv-", *i);
2902 if (!n)
2903 return log_oom();
2904
2905 strshorten(n, IFNAMSIZ-1);
2906
2907 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2908 if (r < 0)
2909 return log_error_errno(r, "Failed to add netlink interface name: %m");
2910
2911 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2914
2915 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2916 if (r < 0)
2917 return log_error_errno(r, "Failed to open netlink container: %m");
2918
2919 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2920 if (r < 0)
2921 return log_error_errno(r, "Failed to open netlink container: %m");
2922
2923 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2924 if (r < 0)
2925 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2926
2927 r = sd_netlink_message_close_container(m);
2928 if (r < 0)
2929 return log_error_errno(r, "Failed to close netlink container: %m");
2930
2931 r = sd_netlink_message_close_container(m);
2932 if (r < 0)
2933 return log_error_errno(r, "Failed to close netlink container: %m");
2934
2935 r = sd_netlink_call(rtnl, m, 0, NULL);
2936 if (r < 0)
2937 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2938 }
2939
2940 return 0;
2941 }
2942
/* Builds and loads a seccomp filter that allows everything by default,
 * except:
 *
 *  - the syscalls in the table below, each of which is made to fail with
 *    EPERM unless the capability listed next to it is part of arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 *    EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. A -EINVAL from
 * seccomp_load() is logged at debug level and turned into success.
 * Otherwise returns the result of the last libseccomp call, which is
 * negative on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* A retained capability keeps its syscalls available. */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when run inside one. We do not care and simply
         * turn off the creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as the indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3037
3038 static int setup_propagate(const char *root) {
3039 const char *p, *q;
3040
3041 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3042 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3043 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3044 (void) mkdir_p(p, 0600);
3045
3046 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3047 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3048
3049 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3050 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3051
3052 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3053 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3054
3055 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3056 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3057 return log_error_errno(errno, "Failed to install propagation bind mount.");
3058
3059 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3060 return log_error_errno(errno, "Failed to make propagation mount read-only");
3061
3062 return 0;
3063 }
3064
3065 static int setup_image(char **device_path, int *loop_nr) {
3066 struct loop_info64 info = {
3067 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3068 };
3069 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3070 _cleanup_free_ char* loopdev = NULL;
3071 struct stat st;
3072 int r, nr;
3073
3074 assert(device_path);
3075 assert(loop_nr);
3076 assert(arg_image);
3077
3078 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3079 if (fd < 0)
3080 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3081
3082 if (fstat(fd, &st) < 0)
3083 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3084
3085 if (S_ISBLK(st.st_mode)) {
3086 char *p;
3087
3088 p = strdup(arg_image);
3089 if (!p)
3090 return log_oom();
3091
3092 *device_path = p;
3093
3094 *loop_nr = -1;
3095
3096 r = fd;
3097 fd = -1;
3098
3099 return r;
3100 }
3101
3102 if (!S_ISREG(st.st_mode)) {
3103 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3104 return -EINVAL;
3105 }
3106
3107 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3108 if (control < 0)
3109 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3110
3111 nr = ioctl(control, LOOP_CTL_GET_FREE);
3112 if (nr < 0)
3113 return log_error_errno(errno, "Failed to allocate loop device: %m");
3114
3115 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3116 return log_oom();
3117
3118 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3119 if (loop < 0)
3120 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3121
3122 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3123 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3124
3125 if (arg_read_only)
3126 info.lo_flags |= LO_FLAGS_READ_ONLY;
3127
3128 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3129 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3130
3131 *device_path = loopdev;
3132 loopdev = NULL;
3133
3134 *loop_nr = nr;
3135
3136 r = loop;
3137 loop = -1;
3138
3139 return r;
3140 }
3141
/* Explanatory text appended to the error messages that are printed when no
 * usable partition table is found on the image. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"     \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                 \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                      \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"      \
        "to be bootable with systemd-nspawn."
3148
3149 static int dissect_image(
3150 int fd,
3151 char **root_device, bool *root_device_rw,
3152 char **home_device, bool *home_device_rw,
3153 char **srv_device, bool *srv_device_rw,
3154 bool *secondary) {
3155
3156 #ifdef HAVE_BLKID
3157 int home_nr = -1, srv_nr = -1;
3158 #ifdef GPT_ROOT_NATIVE
3159 int root_nr = -1;
3160 #endif
3161 #ifdef GPT_ROOT_SECONDARY
3162 int secondary_root_nr = -1;
3163 #endif
3164 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3165 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3166 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3167 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3168 _cleanup_udev_unref_ struct udev *udev = NULL;
3169 struct udev_list_entry *first, *item;
3170 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3171 bool is_gpt, is_mbr, multiple_generic = false;
3172 const char *pttype = NULL;
3173 blkid_partlist pl;
3174 struct stat st;
3175 unsigned i;
3176 int r;
3177
3178 assert(fd >= 0);
3179 assert(root_device);
3180 assert(home_device);
3181 assert(srv_device);
3182 assert(secondary);
3183 assert(arg_image);
3184
3185 b = blkid_new_probe();
3186 if (!b)
3187 return log_oom();
3188
3189 errno = 0;
3190 r = blkid_probe_set_device(b, fd, 0, 0);
3191 if (r != 0) {
3192 if (errno == 0)
3193 return log_oom();
3194
3195 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3196 return -errno;
3197 }
3198
3199 blkid_probe_enable_partitions(b, 1);
3200 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3201
3202 errno = 0;
3203 r = blkid_do_safeprobe(b);
3204 if (r == -2 || r == 1) {
3205 log_error("Failed to identify any partition table on\n"
3206 " %s\n"
3207 PARTITION_TABLE_BLURB, arg_image);
3208 return -EINVAL;
3209 } else if (r != 0) {
3210 if (errno == 0)
3211 errno = EIO;
3212 log_error_errno(errno, "Failed to probe: %m");
3213 return -errno;
3214 }
3215
3216 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3217
3218 is_gpt = streq_ptr(pttype, "gpt");
3219 is_mbr = streq_ptr(pttype, "dos");
3220
3221 if (!is_gpt && !is_mbr) {
3222 log_error("No GPT or MBR partition table discovered on\n"
3223 " %s\n"
3224 PARTITION_TABLE_BLURB, arg_image);
3225 return -EINVAL;
3226 }
3227
3228 errno = 0;
3229 pl = blkid_probe_get_partitions(b);
3230 if (!pl) {
3231 if (errno == 0)
3232 return log_oom();
3233
3234 log_error("Failed to list partitions of %s", arg_image);
3235 return -errno;
3236 }
3237
3238 udev = udev_new();
3239 if (!udev)
3240 return log_oom();
3241
3242 if (fstat(fd, &st) < 0)
3243 return log_error_errno(errno, "Failed to stat block device: %m");
3244
3245 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3246 if (!d)
3247 return log_oom();
3248
3249 for (i = 0;; i++) {
3250 int n, m;
3251
3252 if (i >= 10) {
3253 log_error("Kernel partitions never appeared.");
3254 return -ENXIO;
3255 }
3256
3257 e = udev_enumerate_new(udev);
3258 if (!e)
3259 return log_oom();
3260
3261 r = udev_enumerate_add_match_parent(e, d);
3262 if (r < 0)
3263 return log_oom();
3264
3265 r = udev_enumerate_scan_devices(e);
3266 if (r < 0)
3267 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3268
3269 /* Count the partitions enumerated by the kernel */
3270 n = 0;
3271 first = udev_enumerate_get_list_entry(e);
3272 udev_list_entry_foreach(item, first)
3273 n++;
3274
3275 /* Count the partitions enumerated by blkid */
3276 m = blkid_partlist_numof_partitions(pl);
3277 if (n == m + 1)
3278 break;
3279 if (n > m + 1) {
3280 log_error("blkid and kernel partition list do not match.");
3281 return -EIO;
3282 }
3283 if (n < m + 1) {
3284 unsigned j;
3285
3286 /* The kernel has probed fewer partitions than
3287 * blkid? Maybe the kernel prober is still
3288 * running or it got EBUSY because udev
3289 * already opened the device. Let's reprobe
3290 * the device, which is a synchronous call
3291 * that waits until probing is complete. */
3292
3293 for (j = 0; j < 20; j++) {
3294
3295 r = ioctl(fd, BLKRRPART, 0);
3296 if (r < 0)
3297 r = -errno;
3298 if (r >= 0 || r != -EBUSY)
3299 break;
3300
3301 /* If something else has the device
3302 * open, such as an udev rule, the
3303 * ioctl will return EBUSY. Since
3304 * there's no way to wait until it
3305 * isn't busy anymore, let's just wait
3306 * a bit, and try again.
3307 *
3308 * This is really something they
3309 * should fix in the kernel! */
3310
3311 usleep(50 * USEC_PER_MSEC);
3312 }
3313
3314 if (r < 0)
3315 return log_error_errno(r, "Failed to reread partition table: %m");
3316 }
3317
3318 e = udev_enumerate_unref(e);
3319 }
3320
3321 first = udev_enumerate_get_list_entry(e);
3322 udev_list_entry_foreach(item, first) {
3323 _cleanup_udev_device_unref_ struct udev_device *q;
3324 const char *node;
3325 unsigned long long flags;
3326 blkid_partition pp;
3327 dev_t qn;
3328 int nr;
3329
3330 errno = 0;
3331 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3332 if (!q) {
3333 if (!errno)
3334 errno = ENOMEM;
3335
3336 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3337 return -errno;
3338 }
3339
3340 qn = udev_device_get_devnum(q);
3341 if (major(qn) == 0)
3342 continue;
3343
3344 if (st.st_rdev == qn)
3345 continue;
3346
3347 node = udev_device_get_devnode(q);
3348 if (!node)
3349 continue;
3350
3351 pp = blkid_partlist_devno_to_partition(pl, qn);
3352 if (!pp)
3353 continue;
3354
3355 flags = blkid_partition_get_flags(pp);
3356
3357 nr = blkid_partition_get_partno(pp);
3358 if (nr < 0)
3359 continue;
3360
3361 if (is_gpt) {
3362 sd_id128_t type_id;
3363 const char *stype;
3364
3365 if (flags & GPT_FLAG_NO_AUTO)
3366 continue;
3367
3368 stype = blkid_partition_get_type_string(pp);
3369 if (!stype)
3370 continue;
3371
3372 if (sd_id128_from_string(stype, &type_id) < 0)
3373 continue;
3374
3375 if (sd_id128_equal(type_id, GPT_HOME)) {
3376
3377 if (home && nr >= home_nr)
3378 continue;
3379
3380 home_nr = nr;
3381 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3382
3383 r = free_and_strdup(&home, node);
3384 if (r < 0)
3385 return log_oom();
3386
3387 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3388
3389 if (srv && nr >= srv_nr)
3390 continue;
3391
3392 srv_nr = nr;
3393 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3394
3395 r = free_and_strdup(&srv, node);
3396 if (r < 0)
3397 return log_oom();
3398 }
3399 #ifdef GPT_ROOT_NATIVE
3400 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3401
3402 if (root && nr >= root_nr)
3403 continue;
3404
3405 root_nr = nr;
3406 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3407
3408 r = free_and_strdup(&root, node);
3409 if (r < 0)
3410 return log_oom();
3411 }
3412 #endif
3413 #ifdef GPT_ROOT_SECONDARY
3414 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3415
3416 if (secondary_root && nr >= secondary_root_nr)
3417 continue;
3418
3419 secondary_root_nr = nr;
3420 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3421
3422 r = free_and_strdup(&secondary_root, node);
3423 if (r < 0)
3424 return log_oom();
3425 }
3426 #endif
3427 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3428
3429 if (generic)
3430 multiple_generic = true;
3431 else {
3432 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3433
3434 r = free_and_strdup(&generic, node);
3435 if (r < 0)
3436 return log_oom();
3437 }
3438 }
3439
3440 } else if (is_mbr) {
3441 int type;
3442
3443 if (flags != 0x80) /* Bootable flag */
3444 continue;
3445
3446 type = blkid_partition_get_type(pp);
3447 if (type != 0x83) /* Linux partition */
3448 continue;
3449
3450 if (generic)
3451 multiple_generic = true;
3452 else {
3453 generic_rw = true;
3454
3455 r = free_and_strdup(&root, node);
3456 if (r < 0)
3457 return log_oom();
3458 }
3459 }
3460 }
3461
3462 if (root) {
3463 *root_device = root;
3464 root = NULL;
3465
3466 *root_device_rw = root_rw;
3467 *secondary = false;
3468 } else if (secondary_root) {
3469 *root_device = secondary_root;
3470 secondary_root = NULL;
3471
3472 *root_device_rw = secondary_root_rw;
3473 *secondary = true;
3474 } else if (generic) {
3475
3476 /* There were no partitions with precise meanings
3477 * around, but we found generic partitions. In this
3478 * case, if there's only one, we can go ahead and boot
3479 * it, otherwise we bail out, because we really cannot
3480 * make any sense of it. */
3481
3482 if (multiple_generic) {
3483 log_error("Identified multiple bootable Linux partitions on\n"
3484 " %s\n"
3485 PARTITION_TABLE_BLURB, arg_image);
3486 return -EINVAL;
3487 }
3488
3489 *root_device = generic;
3490 generic = NULL;
3491
3492 *root_device_rw = generic_rw;
3493 *secondary = false;
3494 } else {
3495 log_error("Failed to identify root partition in disk image\n"
3496 " %s\n"
3497 PARTITION_TABLE_BLURB, arg_image);
3498 return -EINVAL;
3499 }
3500
3501 if (home) {
3502 *home_device = home;
3503 home = NULL;
3504
3505 *home_device_rw = home_rw;
3506 }
3507
3508 if (srv) {
3509 *srv_device = srv;
3510 srv = NULL;
3511
3512 *srv_device_rw = srv_rw;
3513 }
3514
3515 return 0;
3516 #else
3517 log_error("--image= is not supported, compiled without blkid support.");
3518 return -EOPNOTSUPP;
3519 #endif
3520 }
3521
/* Mounts the block device "what" below "where".
 *
 * The file system type is probed with libblkid. If "directory" is non-NULL it
 * is appended to "where" to form the mount point, otherwise "where" itself is
 * the mount point. The mount is read-only if "rw" is false or if
 * arg_read_only is set. Devices whose type is "crypto_LUKS" are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * compiled without blkid support this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *target, *type;
        unsigned long flags = MS_NODEV;
        int k;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-device setting */
        if (arg_read_only || !rw)
                flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        /* blkid does not always set errno, so clear it first to be able to
         * tell "no error code" apart from a real one. */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        /* We are only interested in the superblock type */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        k = blkid_do_safeprobe(probe);
        if (k == -1 || k == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (k != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &type, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(type, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, target, type, flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3585
3586 static int mount_devices(
3587 const char *where,
3588 const char *root_device, bool root_device_rw,
3589 const char *home_device, bool home_device_rw,
3590 const char *srv_device, bool srv_device_rw) {
3591 int r;
3592
3593 assert(where);
3594
3595 if (root_device) {
3596 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3597 if (r < 0)
3598 return log_error_errno(r, "Failed to mount root directory: %m");
3599 }
3600
3601 if (home_device) {
3602 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3603 if (r < 0)
3604 return log_error_errno(r, "Failed to mount home directory: %m");
3605 }
3606
3607 if (srv_device) {
3608 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3609 if (r < 0)
3610 return log_error_errno(r, "Failed to mount server data directory: %m");
3611 }
3612
3613 return 0;
3614 }
3615
3616 static void loop_remove(int nr, int *image_fd) {
3617 _cleanup_close_ int control = -1;
3618 int r;
3619
3620 if (nr < 0)
3621 return;
3622
3623 if (image_fd && *image_fd >= 0) {
3624 r = ioctl(*image_fd, LOOP_CLR_FD);
3625 if (r < 0)
3626 log_debug_errno(errno, "Failed to close loop image: %m");
3627 *image_fd = safe_close(*image_fd);
3628 }
3629
3630 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3631 if (control < 0) {
3632 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3633 return;
3634 }
3635
3636 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3637 if (r < 0)
3638 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3639 }
3640
3641 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3642 int pipe_fds[2];
3643 pid_t pid;
3644
3645 assert(database);
3646 assert(key);
3647 assert(rpid);
3648
3649 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3650 return log_error_errno(errno, "Failed to allocate pipe: %m");
3651
3652 pid = fork();
3653 if (pid < 0)
3654 return log_error_errno(errno, "Failed to fork getent child: %m");
3655 else if (pid == 0) {
3656 int nullfd;
3657 char *empty_env = NULL;
3658
3659 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3660 _exit(EXIT_FAILURE);
3661
3662 if (pipe_fds[0] > 2)
3663 safe_close(pipe_fds[0]);
3664 if (pipe_fds[1] > 2)
3665 safe_close(pipe_fds[1]);
3666
3667 nullfd = open("/dev/null", O_RDWR);
3668 if (nullfd < 0)
3669 _exit(EXIT_FAILURE);
3670
3671 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3672 _exit(EXIT_FAILURE);
3673
3674 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3675 _exit(EXIT_FAILURE);
3676
3677 if (nullfd > 2)
3678 safe_close(nullfd);
3679
3680 (void) reset_all_signal_handlers();
3681 (void) reset_signal_mask();
3682 close_all_fds(NULL, 0);
3683
3684 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3685 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3686 _exit(EXIT_FAILURE);
3687 }
3688
3689 pipe_fds[1] = safe_close(pipe_fds[1]);
3690
3691 *rpid = pid;
3692
3693 return pipe_fds[0];
3694 }
3695
3696 static int change_uid_gid(char **_home) {
3697 char line[LINE_MAX], *x, *u, *g, *h;
3698 const char *word, *state;
3699 _cleanup_free_ uid_t *uids = NULL;
3700 _cleanup_free_ char *home = NULL;
3701 _cleanup_fclose_ FILE *f = NULL;
3702 _cleanup_close_ int fd = -1;
3703 unsigned n_uids = 0;
3704 size_t sz = 0, l;
3705 uid_t uid;
3706 gid_t gid;
3707 pid_t pid;
3708 int r;
3709
3710 assert(_home);
3711
3712 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3713 /* Reset everything fully to 0, just in case */
3714
3715 r = reset_uid_gid();
3716 if (r < 0)
3717 return log_error_errno(r, "Failed to become root: %m");
3718
3719 *_home = NULL;
3720 return 0;
3721 }
3722
3723 /* First, get user credentials */
3724 fd = spawn_getent("passwd", arg_user, &pid);
3725 if (fd < 0)
3726 return fd;
3727
3728 f = fdopen(fd, "r");
3729 if (!f)
3730 return log_oom();
3731 fd = -1;
3732
3733 if (!fgets(line, sizeof(line), f)) {
3734
3735 if (!ferror(f)) {
3736 log_error("Failed to resolve user %s.", arg_user);
3737 return -ESRCH;
3738 }
3739
3740 log_error_errno(errno, "Failed to read from getent: %m");
3741 return -errno;
3742 }
3743
3744 truncate_nl(line);
3745
3746 wait_for_terminate_and_warn("getent passwd", pid, true);
3747
3748 x = strchr(line, ':');
3749 if (!x) {
3750 log_error("/etc/passwd entry has invalid user field.");
3751 return -EIO;
3752 }
3753
3754 u = strchr(x+1, ':');
3755 if (!u) {
3756 log_error("/etc/passwd entry has invalid password field.");
3757 return -EIO;
3758 }
3759
3760 u++;
3761 g = strchr(u, ':');
3762 if (!g) {
3763 log_error("/etc/passwd entry has invalid UID field.");
3764 return -EIO;
3765 }
3766
3767 *g = 0;
3768 g++;
3769 x = strchr(g, ':');
3770 if (!x) {
3771 log_error("/etc/passwd entry has invalid GID field.");
3772 return -EIO;
3773 }
3774
3775 *x = 0;
3776 h = strchr(x+1, ':');
3777 if (!h) {
3778 log_error("/etc/passwd entry has invalid GECOS field.");
3779 return -EIO;
3780 }
3781
3782 h++;
3783 x = strchr(h, ':');
3784 if (!x) {
3785 log_error("/etc/passwd entry has invalid home directory field.");
3786 return -EIO;
3787 }
3788
3789 *x = 0;
3790
3791 r = parse_uid(u, &uid);
3792 if (r < 0) {
3793 log_error("Failed to parse UID of user.");
3794 return -EIO;
3795 }
3796
3797 r = parse_gid(g, &gid);
3798 if (r < 0) {
3799 log_error("Failed to parse GID of user.");
3800 return -EIO;
3801 }
3802
3803 home = strdup(h);
3804 if (!home)
3805 return log_oom();
3806
3807 /* Second, get group memberships */
3808 fd = spawn_getent("initgroups", arg_user, &pid);
3809 if (fd < 0)
3810 return fd;
3811
3812 fclose(f);
3813 f = fdopen(fd, "r");
3814 if (!f)
3815 return log_oom();
3816 fd = -1;
3817
3818 if (!fgets(line, sizeof(line), f)) {
3819 if (!ferror(f)) {
3820 log_error("Failed to resolve user %s.", arg_user);
3821 return -ESRCH;
3822 }
3823
3824 log_error_errno(errno, "Failed to read from getent: %m");
3825 return -errno;
3826 }
3827
3828 truncate_nl(line);
3829
3830 wait_for_terminate_and_warn("getent initgroups", pid, true);
3831
3832 /* Skip over the username and subsequent separator whitespace */
3833 x = line;
3834 x += strcspn(x, WHITESPACE);
3835 x += strspn(x, WHITESPACE);
3836
3837 FOREACH_WORD(word, l, x, state) {
3838 char c[l+1];
3839
3840 memcpy(c, word, l);
3841 c[l] = 0;
3842
3843 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3844 return log_oom();
3845
3846 r = parse_uid(c, &uids[n_uids++]);
3847 if (r < 0) {
3848 log_error("Failed to parse group data from getent.");
3849 return -EIO;
3850 }
3851 }
3852
3853 r = mkdir_parents(home, 0775);
3854 if (r < 0)
3855 return log_error_errno(r, "Failed to make home root directory: %m");
3856
3857 r = mkdir_safe(home, 0755, uid, gid);
3858 if (r < 0 && r != -EEXIST)
3859 return log_error_errno(r, "Failed to make home directory: %m");
3860
3861 (void) fchown(STDIN_FILENO, uid, gid);
3862 (void) fchown(STDOUT_FILENO, uid, gid);
3863 (void) fchown(STDERR_FILENO, uid, gid);
3864
3865 if (setgroups(n_uids, uids) < 0)
3866 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3867
3868 if (setresgid(gid, gid, gid) < 0)
3869 return log_error_errno(errno, "setregid() failed: %m");
3870
3871 if (setresuid(uid, uid, uid) < 0)
3872 return log_error_errno(errno, "setreuid() failed: %m");
3873
3874 if (_home) {
3875 *_home = home;
3876 home = NULL;
3877 }
3878
3879 return 0;
3880 }
3881
3882 /*
3883 * Return values:
3884 * < 0 : wait_for_terminate() failed to get the state of the
3885 * container, the container was terminated by a signal, or
3886 * failed for an unknown reason. No change is made to the
3887 * container argument.
3888 * > 0 : The program executed in the container terminated with an
3889 * error. The exit code of the program executed in the
3890 * container is returned. The container argument has been set
3891 * to CONTAINER_TERMINATED.
3892 * 0 : The container is being rebooted, has been shut down or exited
3893 * successfully. The container argument has been set to either
3894 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3895 *
3896 * That is, success is indicated by a return value of zero, and an
3897 * error is indicated by a non-zero value.
3898 */
3899 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3900 siginfo_t status;
3901 int r;
3902
3903 r = wait_for_terminate(pid, &status);
3904 if (r < 0)
3905 return log_warning_errno(r, "Failed to wait for container: %m");
3906
3907 switch (status.si_code) {
3908
3909 case CLD_EXITED:
3910 if (status.si_status == 0) {
3911 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3912
3913 } else
3914 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3915
3916 *container = CONTAINER_TERMINATED;
3917 return status.si_status;
3918
3919 case CLD_KILLED:
3920 if (status.si_status == SIGINT) {
3921
3922 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3923 *container = CONTAINER_TERMINATED;
3924 return 0;
3925
3926 } else if (status.si_status == SIGHUP) {
3927
3928 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3929 *container = CONTAINER_REBOOTED;
3930 return 0;
3931 }
3932
3933 /* CLD_KILLED fallthrough */
3934
3935 case CLD_DUMPED:
3936 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3937 return -EIO;
3938
3939 default:
3940 log_error("Container %s failed due to unknown reason.", arg_machine);
3941 return -EIO;
3942 }
3943
3944 return r;
3945 }
3946
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
3948
3949 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3950 pid_t pid;
3951
3952 pid = PTR_TO_UINT32(userdata);
3953 if (pid > 0) {
3954 if (kill(pid, arg_kill_signal) >= 0) {
3955 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3956 sd_event_source_set_userdata(s, NULL);
3957 return 0;
3958 }
3959 }
3960
3961 sd_event_exit(sd_event_source_get_event(s), 0);
3962 return 0;
3963 }
3964
3965 static int determine_names(void) {
3966 int r;
3967
3968 if (!arg_image && !arg_directory) {
3969 if (arg_machine) {
3970 _cleanup_(image_unrefp) Image *i = NULL;
3971
3972 r = image_find(arg_machine, &i);
3973 if (r < 0)
3974 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3975 else if (r == 0) {
3976 log_error("No image for machine '%s': %m", arg_machine);
3977 return -ENOENT;
3978 }
3979
3980 if (i->type == IMAGE_RAW)
3981 r = set_sanitized_path(&arg_image, i->path);
3982 else
3983 r = set_sanitized_path(&arg_directory, i->path);
3984 if (r < 0)
3985 return log_error_errno(r, "Invalid image directory: %m");
3986
3987 if (!arg_ephemeral)
3988 arg_read_only = arg_read_only || i->read_only;
3989 } else
3990 arg_directory = get_current_dir_name();
3991
3992 if (!arg_directory && !arg_machine) {
3993 log_error("Failed to determine path, please use -D or -i.");
3994 return -EINVAL;
3995 }
3996 }
3997
3998 if (!arg_machine) {
3999 if (arg_directory && path_equal(arg_directory, "/"))
4000 arg_machine = gethostname_malloc();
4001 else
4002 arg_machine = strdup(basename(arg_image ?: arg_directory));
4003
4004 if (!arg_machine)
4005 return log_oom();
4006
4007 hostname_cleanup(arg_machine, false);
4008 if (!machine_name_is_valid(arg_machine)) {
4009 log_error("Failed to determine machine name automatically, please use -M.");
4010 return -EINVAL;
4011 }
4012
4013 if (arg_ephemeral) {
4014 char *b;
4015
4016 /* Add a random suffix when this is an
4017 * ephemeral machine, so that we can run many
4018 * instances at once without manually having
4019 * to specify -M each time. */
4020
4021 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4022 return log_oom();
4023
4024 free(arg_machine);
4025 arg_machine = b;
4026 }
4027 }
4028
4029 return 0;
4030 }
4031
4032 static int determine_uid_shift(const char *directory) {
4033 int r;
4034
4035 if (!arg_userns) {
4036 arg_uid_shift = 0;
4037 return 0;
4038 }
4039
4040 if (arg_uid_shift == UID_INVALID) {
4041 struct stat st;
4042
4043 r = stat(directory, &st);
4044 if (r < 0)
4045 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4046
4047 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4048
4049 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4050 log_error("UID and GID base of %s don't match.", directory);
4051 return -EINVAL;
4052 }
4053
4054 arg_uid_range = UINT32_C(0x10000);
4055 }
4056
4057 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4058 log_error("UID base too high for UID range.");
4059 return -EINVAL;
4060 }
4061
4062 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4063 return 0;
4064 }
4065
4066 static int inner_child(
4067 Barrier *barrier,
4068 const char *directory,
4069 bool secondary,
4070 int kmsg_socket,
4071 int rtnl_socket,
4072 FDSet *fds,
4073 int argc,
4074 char *argv[]) {
4075
4076 _cleanup_free_ char *home = NULL;
4077 unsigned n_env = 2;
4078 const char *envp[] = {
4079 "PATH=" DEFAULT_PATH_SPLIT_USR,
4080 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4081 NULL, /* TERM */
4082 NULL, /* HOME */
4083 NULL, /* USER */
4084 NULL, /* LOGNAME */
4085 NULL, /* container_uuid */
4086 NULL, /* LISTEN_FDS */
4087 NULL, /* LISTEN_PID */
4088 NULL
4089 };
4090
4091 _cleanup_strv_free_ char **env_use = NULL;
4092 int r;
4093
4094 assert(barrier);
4095 assert(directory);
4096 assert(kmsg_socket >= 0);
4097
4098 if (arg_userns) {
4099 /* Tell the parent, that it now can write the UID map. */
4100 (void) barrier_place(barrier); /* #1 */
4101
4102 /* Wait until the parent wrote the UID map */
4103 if (!barrier_place_and_sync(barrier)) { /* #2 */
4104 log_error("Parent died too early");
4105 return -ESRCH;
4106 }
4107 }
4108
4109 r = mount_all(NULL, true);
4110 if (r < 0)
4111 return r;
4112
4113 /* Wait until we are cgroup-ified, so that we
4114 * can mount the right cgroup path writable */
4115 if (!barrier_place_and_sync(barrier)) { /* #3 */
4116 log_error("Parent died too early");
4117 return -ESRCH;
4118 }
4119
4120 r = mount_systemd_cgroup_writable("");
4121 if (r < 0)
4122 return r;
4123
4124 r = reset_uid_gid();
4125 if (r < 0)
4126 return log_error_errno(r, "Couldn't become new root: %m");
4127
4128 r = setup_boot_id(NULL);
4129 if (r < 0)
4130 return r;
4131
4132 r = setup_kmsg(NULL, kmsg_socket);
4133 if (r < 0)
4134 return r;
4135 kmsg_socket = safe_close(kmsg_socket);
4136
4137 umask(0022);
4138
4139 if (setsid() < 0)
4140 return log_error_errno(errno, "setsid() failed: %m");
4141
4142 if (arg_private_network)
4143 loopback_setup();
4144
4145 r = send_rtnl(rtnl_socket);
4146 if (r < 0)
4147 return r;
4148 rtnl_socket = safe_close(rtnl_socket);
4149
4150 if (drop_capabilities() < 0)
4151 return log_error_errno(errno, "drop_capabilities() failed: %m");
4152
4153 setup_hostname();
4154
4155 if (arg_personality != PERSONALITY_INVALID) {
4156 if (personality(arg_personality) < 0)
4157 return log_error_errno(errno, "personality() failed: %m");
4158 } else if (secondary) {
4159 if (personality(PER_LINUX32) < 0)
4160 return log_error_errno(errno, "personality() failed: %m");
4161 }
4162
4163 #ifdef HAVE_SELINUX
4164 if (arg_selinux_context)
4165 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4166 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4167 #endif
4168
4169 r = change_uid_gid(&home);
4170 if (r < 0)
4171 return r;
4172
4173 envp[n_env] = strv_find_prefix(environ, "TERM=");
4174 if (envp[n_env])
4175 n_env ++;
4176
4177 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4178 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4179 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4180 return log_oom();
4181
4182 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4183 char as_uuid[37];
4184
4185 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4186 return log_oom();
4187 }
4188
4189 if (fdset_size(fds) > 0) {
4190 r = fdset_cloexec(fds, false);
4191 if (r < 0)
4192 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4193
4194 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4195 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4196 return log_oom();
4197 }
4198
4199 env_use = strv_env_merge(2, envp, arg_setenv);
4200 if (!env_use)
4201 return log_oom();
4202
4203 /* Let the parent know that we are ready and
4204 * wait until the parent is ready with the
4205 * setup, too... */
4206 if (!barrier_place_and_sync(barrier)) { /* #4 */
4207 log_error("Parent died too early");
4208 return -ESRCH;
4209 }
4210
4211 /* Now, explicitly close the log, so that we
4212 * then can close all remaining fds. Closing
4213 * the log explicitly first has the benefit
4214 * that the logging subsystem knows about it,
4215 * and is thus ready to be reopened should we
4216 * need it again. Note that the other fds
4217 * closed here are at least the locking and
4218 * barrier fds. */
4219 log_close();
4220 (void) fdset_close_others(fds);
4221
4222 if (arg_boot) {
4223 char **a;
4224 size_t m;
4225
4226 /* Automatically search for the init system */
4227
4228 m = 1 + argc - optind;
4229 a = newa(char*, m + 1);
4230 memcpy(a + 1, argv + optind, m * sizeof(char*));
4231
4232 a[0] = (char*) "/usr/lib/systemd/systemd";
4233 execve(a[0], a, env_use);
4234
4235 a[0] = (char*) "/lib/systemd/systemd";
4236 execve(a[0], a, env_use);
4237
4238 a[0] = (char*) "/sbin/init";
4239 execve(a[0], a, env_use);
4240 } else if (argc > optind)
4241 execvpe(argv[optind], argv + optind, env_use);
4242 else {
4243 chdir(home ? home : "/root");
4244 execle("/bin/bash", "-bash", NULL, env_use);
4245 execle("/bin/sh", "-sh", NULL, env_use);
4246 }
4247
4248 (void) log_open();
4249 return log_error_errno(errno, "execv() failed: %m");
4250 }
4251
4252 static int outer_child(
4253 Barrier *barrier,
4254 const char *directory,
4255 const char *console,
4256 const char *root_device, bool root_device_rw,
4257 const char *home_device, bool home_device_rw,
4258 const char *srv_device, bool srv_device_rw,
4259 bool interactive,
4260 bool secondary,
4261 int pid_socket,
4262 int kmsg_socket,
4263 int rtnl_socket,
4264 int uid_shift_socket,
4265 FDSet *fds,
4266 int argc,
4267 char *argv[]) {
4268
4269 pid_t pid;
4270 ssize_t l;
4271 int r;
4272
4273 assert(barrier);
4274 assert(directory);
4275 assert(console);
4276 assert(pid_socket >= 0);
4277 assert(kmsg_socket >= 0);
4278
4279 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4280 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4281
4282 if (interactive) {
4283 close_nointr(STDIN_FILENO);
4284 close_nointr(STDOUT_FILENO);
4285 close_nointr(STDERR_FILENO);
4286
4287 r = open_terminal(console, O_RDWR);
4288 if (r != STDIN_FILENO) {
4289 if (r >= 0) {
4290 safe_close(r);
4291 r = -EINVAL;
4292 }
4293
4294 return log_error_errno(r, "Failed to open console: %m");
4295 }
4296
4297 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4298 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4299 return log_error_errno(errno, "Failed to duplicate console: %m");
4300 }
4301
4302 r = reset_audit_loginuid();
4303 if (r < 0)
4304 return r;
4305
4306 /* Mark everything as slave, so that we still
4307 * receive mounts from the real root, but don't
4308 * propagate mounts to the real root. */
4309 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4310 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4311
4312 r = mount_devices(directory,
4313 root_device, root_device_rw,
4314 home_device, home_device_rw,
4315 srv_device, srv_device_rw);
4316 if (r < 0)
4317 return r;
4318
4319 r = determine_uid_shift(directory);
4320 if (r < 0)
4321 return r;
4322
4323 if (arg_userns) {
4324 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4325 if (l < 0)
4326 return log_error_errno(errno, "Failed to send UID shift: %m");
4327 if (l != sizeof(arg_uid_shift)) {
4328 log_error("Short write while sending UID shift.");
4329 return -EIO;
4330 }
4331 }
4332
4333 /* Turn directory into bind mount */
4334 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4335 return log_error_errno(errno, "Failed to make bind mount: %m");
4336
4337 r = setup_volatile(directory);
4338 if (r < 0)
4339 return r;
4340
4341 r = setup_volatile_state(directory);
4342 if (r < 0)
4343 return r;
4344
4345 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4346 if (r < 0)
4347 return r;
4348
4349 if (arg_read_only) {
4350 r = bind_remount_recursive(directory, true);
4351 if (r < 0)
4352 return log_error_errno(r, "Failed to make tree read-only: %m");
4353 }
4354
4355 r = mount_all(directory, false);
4356 if (r < 0)
4357 return r;
4358
4359 if (copy_devnodes(directory) < 0)
4360 return r;
4361
4362 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4363
4364 if (setup_pts(directory) < 0)
4365 return r;
4366
4367 r = setup_propagate(directory);
4368 if (r < 0)
4369 return r;
4370
4371 r = setup_dev_console(directory, console);
4372 if (r < 0)
4373 return r;
4374
4375 r = setup_seccomp();
4376 if (r < 0)
4377 return r;
4378
4379 r = setup_timezone(directory);
4380 if (r < 0)
4381 return r;
4382
4383 r = setup_resolv_conf(directory);
4384 if (r < 0)
4385 return r;
4386
4387 r = setup_journal(directory);
4388 if (r < 0)
4389 return r;
4390
4391 r = mount_custom(directory);
4392 if (r < 0)
4393 return r;
4394
4395 r = mount_cgroup(directory);
4396 if (r < 0)
4397 return r;
4398
4399 r = mount_move_root(directory);
4400 if (r < 0)
4401 return log_error_errno(r, "Failed to move root directory: %m");
4402
4403 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4404 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4405 (arg_private_network ? CLONE_NEWNET : 0) |
4406 (arg_userns ? CLONE_NEWUSER : 0),
4407 NULL);
4408 if (pid < 0)
4409 return log_error_errno(errno, "Failed to fork inner child: %m");
4410
4411 if (pid == 0) {
4412 pid_socket = safe_close(pid_socket);
4413 uid_shift_socket = safe_close(uid_shift_socket);
4414
4415 /* The inner child has all namespaces that are
4416 * requested, so that we all are owned by the user if
4417 * user namespaces are turned on. */
4418
4419 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4420 if (r < 0)
4421 _exit(EXIT_FAILURE);
4422
4423 _exit(EXIT_SUCCESS);
4424 }
4425
4426 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4427 if (l < 0)
4428 return log_error_errno(errno, "Failed to send PID: %m");
4429 if (l != sizeof(pid)) {
4430 log_error("Short write while sending PID.");
4431 return -EIO;
4432 }
4433
4434 pid_socket = safe_close(pid_socket);
4435
4436 return 0;
4437 }
4438
4439 static int setup_uid_map(pid_t pid) {
4440 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4441 int r;
4442
4443 assert(pid > 1);
4444
4445 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4446 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4447 r = write_string_file(uid_map, line, 0);
4448 if (r < 0)
4449 return log_error_errno(r, "Failed to write UID map: %m");
4450
4451 /* We always assign the same UID and GID ranges */
4452 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4453 r = write_string_file(uid_map, line, 0);
4454 if (r < 0)
4455 return log_error_errno(r, "Failed to write GID map: %m");
4456
4457 return 0;
4458 }
4459
4460 static int chown_cgroup(pid_t pid) {
4461 _cleanup_free_ char *path = NULL, *fs = NULL;
4462 _cleanup_close_ int fd = -1;
4463 const char *fn;
4464 int r;
4465
4466 r = cg_pid_get_path(NULL, pid, &path);
4467 if (r < 0)
4468 return log_error_errno(r, "Failed to get container cgroup path: %m");
4469
4470 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4471 if (r < 0)
4472 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4473
4474 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4475 if (fd < 0)
4476 return log_error_errno(errno, "Failed to open %s: %m", fs);
4477
4478 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4479 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4480 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4481
4482 return 0;
4483 }
4484
4485 int main(int argc, char *argv[]) {
4486
4487 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4488 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4489 _cleanup_close_ int master = -1, image_fd = -1;
4490 _cleanup_fdset_free_ FDSet *fds = NULL;
4491 int r, n_fd_passed, loop_nr = -1;
4492 char veth_name[IFNAMSIZ];
4493 bool secondary = false, remove_subvol = false;
4494 sigset_t mask_chld;
4495 pid_t pid = 0;
4496 int ret = EXIT_SUCCESS;
4497 union in_addr_union exposed = {};
4498 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4499 bool interactive;
4500
4501 log_parse_environment();
4502 log_open();
4503
4504 r = parse_argv(argc, argv);
4505 if (r <= 0)
4506 goto finish;
4507
4508 r = determine_names();
4509 if (r < 0)
4510 goto finish;
4511
4512 if (geteuid() != 0) {
4513 log_error("Need to be root.");
4514 r = -EPERM;
4515 goto finish;
4516 }
4517
4518 n_fd_passed = sd_listen_fds(false);
4519 if (n_fd_passed > 0) {
4520 r = fdset_new_listen_fds(&fds, false);
4521 if (r < 0) {
4522 log_error_errno(r, "Failed to collect file descriptors: %m");
4523 goto finish;
4524 }
4525 }
4526
4527 if (arg_directory) {
4528 assert(!arg_image);
4529
4530 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4531 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4532 r = -EINVAL;
4533 goto finish;
4534 }
4535
4536 if (arg_ephemeral) {
4537 _cleanup_free_ char *np = NULL;
4538
4539 /* If the specified path is a mount point we
4540 * generate the new snapshot immediately
4541 * inside it under a random name. However if
4542 * the specified is not a mount point we
4543 * create the new snapshot in the parent
4544 * directory, just next to it. */
4545 r = path_is_mount_point(arg_directory, 0);
4546 if (r < 0) {
4547 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4548 goto finish;
4549 }
4550 if (r > 0)
4551 r = tempfn_random_child(arg_directory, "machine.", &np);
4552 else
4553 r = tempfn_random(arg_directory, "machine.", &np);
4554 if (r < 0) {
4555 log_error_errno(r, "Failed to generate name for snapshot: %m");
4556 goto finish;
4557 }
4558
4559 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4560 if (r < 0) {
4561 log_error_errno(r, "Failed to lock %s: %m", np);
4562 goto finish;
4563 }
4564
4565 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4566 if (r < 0) {
4567 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4568 goto finish;
4569 }
4570
4571 free(arg_directory);
4572 arg_directory = np;
4573 np = NULL;
4574
4575 remove_subvol = true;
4576
4577 } else {
4578 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4579 if (r == -EBUSY) {
4580 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4581 goto finish;
4582 }
4583 if (r < 0) {
4584 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4585 return r;
4586 }
4587
4588 if (arg_template) {
4589 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4590 if (r == -EEXIST) {
4591 if (!arg_quiet)
4592 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4593 } else if (r < 0) {
4594 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4595 goto finish;
4596 } else {
4597 if (!arg_quiet)
4598 log_info("Populated %s from template %s.", arg_directory, arg_template);
4599 }
4600 }
4601 }
4602
4603 if (arg_boot) {
4604 if (path_is_os_tree(arg_directory) <= 0) {
4605 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4606 r = -EINVAL;
4607 goto finish;
4608 }
4609 } else {
4610 const char *p;
4611
4612 p = strjoina(arg_directory,
4613 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4614 if (access(p, F_OK) < 0) {
4615 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4616 r = -EINVAL;
4617 goto finish;
4618 }
4619 }
4620
4621 } else {
4622 char template[] = "/tmp/nspawn-root-XXXXXX";
4623
4624 assert(arg_image);
4625 assert(!arg_template);
4626
4627 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4628 if (r == -EBUSY) {
4629 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4630 goto finish;
4631 }
4632 if (r < 0) {
4633 r = log_error_errno(r, "Failed to create image lock: %m");
4634 goto finish;
4635 }
4636
4637 if (!mkdtemp(template)) {
4638 log_error_errno(errno, "Failed to create temporary directory: %m");
4639 r = -errno;
4640 goto finish;
4641 }
4642
4643 arg_directory = strdup(template);
4644 if (!arg_directory) {
4645 r = log_oom();
4646 goto finish;
4647 }
4648
4649 image_fd = setup_image(&device_path, &loop_nr);
4650 if (image_fd < 0) {
4651 r = image_fd;
4652 goto finish;
4653 }
4654
4655 r = dissect_image(image_fd,
4656 &root_device, &root_device_rw,
4657 &home_device, &home_device_rw,
4658 &srv_device, &srv_device_rw,
4659 &secondary);
4660 if (r < 0)
4661 goto finish;
4662 }
4663
4664 r = custom_mounts_prepare();
4665 if (r < 0)
4666 goto finish;
4667
4668 interactive =
4669 isatty(STDIN_FILENO) > 0 &&
4670 isatty(STDOUT_FILENO) > 0;
4671
4672 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4673 if (master < 0) {
4674 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4675 goto finish;
4676 }
4677
4678 r = ptsname_malloc(master, &console);
4679 if (r < 0) {
4680 r = log_error_errno(r, "Failed to determine tty name: %m");
4681 goto finish;
4682 }
4683
4684 if (unlockpt(master) < 0) {
4685 r = log_error_errno(errno, "Failed to unlock tty: %m");
4686 goto finish;
4687 }
4688
4689 if (!arg_quiet)
4690 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4691 arg_machine, arg_image ?: arg_directory);
4692
4693 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4694
4695 assert_se(sigemptyset(&mask_chld) == 0);
4696 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4697
4698 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4699 r = log_error_errno(errno, "Failed to become subreaper: %m");
4700 goto finish;
4701 }
4702
4703 for (;;) {
4704 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4705 uid_shift_socket_pair[2] = { -1, -1 };
4706 ContainerStatus container_status;
4707 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4708 static const struct sigaction sa = {
4709 .sa_handler = nop_handler,
4710 .sa_flags = SA_NOCLDSTOP,
4711 };
4712 int ifi = 0;
4713 ssize_t l;
4714 _cleanup_event_unref_ sd_event *event = NULL;
4715 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4716 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4717 char last_char = 0;
4718
4719 r = barrier_create(&barrier);
4720 if (r < 0) {
4721 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4722 goto finish;
4723 }
4724
4725 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4726 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4727 goto finish;
4728 }
4729
4730 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4731 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4732 goto finish;
4733 }
4734
4735 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4736 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4737 goto finish;
4738 }
4739
4740 if (arg_userns)
4741 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4742 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4743 goto finish;
4744 }
4745
4746 /* Child can be killed before execv(), so handle SIGCHLD
4747 * in order to interrupt parent's blocking calls and
4748 * give it a chance to call wait() and terminate. */
4749 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4750 if (r < 0) {
4751 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4752 goto finish;
4753 }
4754
4755 r = sigaction(SIGCHLD, &sa, NULL);
4756 if (r < 0) {
4757 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4758 goto finish;
4759 }
4760
4761 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4762 if (pid < 0) {
4763 if (errno == EINVAL)
4764 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4765 else
4766 r = log_error_errno(errno, "clone() failed: %m");
4767
4768 goto finish;
4769 }
4770
4771 if (pid == 0) {
4772 /* The outer child only has a file system namespace. */
4773 barrier_set_role(&barrier, BARRIER_CHILD);
4774
4775 master = safe_close(master);
4776
4777 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4778 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4779 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4780 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4781
4782 (void) reset_all_signal_handlers();
4783 (void) reset_signal_mask();
4784
4785 r = outer_child(&barrier,
4786 arg_directory,
4787 console,
4788 root_device, root_device_rw,
4789 home_device, home_device_rw,
4790 srv_device, srv_device_rw,
4791 interactive,
4792 secondary,
4793 pid_socket_pair[1],
4794 kmsg_socket_pair[1],
4795 rtnl_socket_pair[1],
4796 uid_shift_socket_pair[1],
4797 fds,
4798 argc, argv);
4799 if (r < 0)
4800 _exit(EXIT_FAILURE);
4801
4802 _exit(EXIT_SUCCESS);
4803 }
4804
4805 barrier_set_role(&barrier, BARRIER_PARENT);
4806
4807 fdset_free(fds);
4808 fds = NULL;
4809
4810 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4811 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4812 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4813
4814 /* Wait for the outer child. */
4815 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4816 if (r < 0)
4817 goto finish;
4818 if (r != 0) {
4819 r = -EIO;
4820 goto finish;
4821 }
4822 pid = 0;
4823
4824 /* And now retrieve the PID of the inner child. */
4825 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4826 if (l < 0) {
4827 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4828 goto finish;
4829 }
4830 if (l != sizeof(pid)) {
4831 log_error("Short read while reading inner child PID: %m");
4832 r = EIO;
4833 goto finish;
4834 }
4835
4836 log_debug("Init process invoked as PID " PID_FMT, pid);
4837
4838 if (arg_userns) {
4839 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4840 log_error("Child died too early.");
4841 r = -ESRCH;
4842 goto finish;
4843 }
4844
4845 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4846 if (l < 0) {
4847 r = log_error_errno(errno, "Failed to read UID shift: %m");
4848 goto finish;
4849 }
4850 if (l != sizeof(arg_uid_shift)) {
4851 log_error("Short read while reading UID shift: %m");
4852 r = EIO;
4853 goto finish;
4854 }
4855
4856 r = setup_uid_map(pid);
4857 if (r < 0)
4858 goto finish;
4859
4860 (void) barrier_place(&barrier); /* #2 */
4861 }
4862
4863 r = move_network_interfaces(pid);
4864 if (r < 0)
4865 goto finish;
4866
4867 r = setup_veth(pid, veth_name, &ifi);
4868 if (r < 0)
4869 goto finish;
4870
4871 r = setup_bridge(veth_name, &ifi);
4872 if (r < 0)
4873 goto finish;
4874
4875 r = setup_macvlan(pid);
4876 if (r < 0)
4877 goto finish;
4878
4879 r = setup_ipvlan(pid);
4880 if (r < 0)
4881 goto finish;
4882
4883 r = register_machine(pid, ifi);
4884 if (r < 0)
4885 goto finish;
4886
4887 r = chown_cgroup(pid);
4888 if (r < 0)
4889 goto finish;
4890
4891 /* Notify the child that the parent is ready with all
4892 * its setup (including cgroup-ification), and that
4893 * the child can now hand over control to the code to
4894 * run inside the container. */
4895 (void) barrier_place(&barrier); /* #3 */
4896
4897 /* Block SIGCHLD here, before notifying child.
4898 * process_pty() will handle it with the other signals. */
4899 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4900
4901 /* Reset signal to default */
4902 r = default_signals(SIGCHLD, -1);
4903 if (r < 0) {
4904 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4905 goto finish;
4906 }
4907
4908 /* Let the child know that we are ready and wait that the child is completely ready now. */
4909 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4910 log_error("Client died too early.");
4911 r = -ESRCH;
4912 goto finish;
4913 }
4914
4915 sd_notifyf(false,
4916 "READY=1\n"
4917 "STATUS=Container running.\n"
4918 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4919
4920 r = sd_event_new(&event);
4921 if (r < 0) {
4922 log_error_errno(r, "Failed to get default event source: %m");
4923 goto finish;
4924 }
4925
4926 if (arg_kill_signal > 0) {
4927 /* Try to kill the init system on SIGINT or SIGTERM */
4928 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4929 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4930 } else {
4931 /* Immediately exit */
4932 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4933 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4934 }
4935
4936 /* simply exit on sigchld */
4937 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4938
4939 if (arg_expose_ports) {
4940 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4941 if (r < 0)
4942 goto finish;
4943
4944 (void) expose_ports(rtnl, &exposed);
4945 }
4946
4947 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4948
4949 r = pty_forward_new(event, master, true, !interactive, &forward);
4950 if (r < 0) {
4951 log_error_errno(r, "Failed to create PTY forwarder: %m");
4952 goto finish;
4953 }
4954
4955 r = sd_event_loop(event);
4956 if (r < 0) {
4957 log_error_errno(r, "Failed to run event loop: %m");
4958 goto finish;
4959 }
4960
4961 pty_forward_get_last_char(forward, &last_char);
4962
4963 forward = pty_forward_free(forward);
4964
4965 if (!arg_quiet && last_char != '\n')
4966 putc('\n', stdout);
4967
4968 /* Kill if it is not dead yet anyway */
4969 terminate_machine(pid);
4970
4971 /* Normally redundant, but better safe than sorry */
4972 kill(pid, SIGKILL);
4973
4974 r = wait_for_container(pid, &container_status);
4975 pid = 0;
4976
4977 if (r < 0)
4978 /* We failed to wait for the container, or the
4979 * container exited abnormally */
4980 goto finish;
4981 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4982 /* The container exited with a non-zero
4983 * status, or with zero status and no reboot
4984 * was requested. */
4985 ret = r;
4986 break;
4987 }
4988
4989 /* CONTAINER_REBOOTED, loop again */
4990
4991 if (arg_keep_unit) {
4992 /* Special handling if we are running as a
4993 * service: instead of simply restarting the
4994 * machine we want to restart the entire
4995 * service, so let's inform systemd about this
4996 * with the special exit code 133. The service
4997 * file uses RestartForceExitStatus=133 so
4998 * that this results in a full nspawn
4999 * restart. This is necessary since we might
5000 * have cgroup parameters set we want to have
5001 * flushed out. */
5002 ret = 133;
5003 r = 0;
5004 break;
5005 }
5006
5007 flush_ports(&exposed);
5008 }
5009
5010 finish:
5011 sd_notify(false,
5012 "STOPPING=1\n"
5013 "STATUS=Terminating...");
5014
5015 if (pid > 0)
5016 kill(pid, SIGKILL);
5017
5018 /* Try to flush whatever is still queued in the pty */
5019 if (master >= 0)
5020 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5021
5022 loop_remove(loop_nr, &image_fd);
5023
5024 if (remove_subvol && arg_directory) {
5025 int k;
5026
5027 k = btrfs_subvol_remove(arg_directory, true);
5028 if (k < 0)
5029 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5030 }
5031
5032 if (arg_machine) {
5033 const char *p;
5034
5035 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5036 (void) rm_rf(p, REMOVE_ROOT);
5037 }
5038
5039 free(arg_directory);
5040 free(arg_template);
5041 free(arg_image);
5042 free(arg_machine);
5043 free(arg_user);
5044 strv_free(arg_setenv);
5045 strv_free(arg_network_interfaces);
5046 strv_free(arg_network_macvlan);
5047 strv_free(arg_network_ipvlan);
5048 custom_mount_free_all();
5049
5050 flush_ports(&exposed);
5051
5052 while (arg_expose_ports) {
5053 ExposePort *p = arg_expose_ports;
5054 LIST_REMOVE(ports, arg_expose_ports, p);
5055 free(p);
5056 }
5057
5058 return r < 0 ? EXIT_FAILURE : ret;
5059 }