]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
Merge pull request #1012 from gentoo-root/master
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "rm-rf.h"
64 #include "macro.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "env-util.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
81 #include "gpt.h"
82 #include "siphash24.h"
83 #include "copy.h"
84 #include "base-filesystem.h"
85 #include "barrier.h"
86 #include "event-util.h"
87 #include "capability.h"
88 #include "cap-list.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
91 #include "list.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106 int protocol;
107 uint16_t host_port;
108 uint16_t container_port;
109 LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* Outcome reported for a finished container run. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
123
/* Mode selected with --volatile[=MODE]: off, on, or "state". */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
129
/* Kind of user-requested mount (--bind=/--bind-ro=, --tmpfs=,
 * --overlay=/--overlay-ro=). The numeric order is also used as the
 * tie-breaker when sorting mounts that share a destination. */
typedef enum CustomMountType {
        CUSTOM_MOUNT_BIND    = 0,
        CUSTOM_MOUNT_TMPFS   = 1,
        CUSTOM_MOUNT_OVERLAY = 2,
} CustomMountType;
135
136 typedef struct CustomMount {
137 CustomMountType type;
138 bool read_only;
139 char *source; /* for overlayfs this is the upper directory */
140 char *destination;
141 char *options;
142 char *work_dir;
143 char **lower;
144 } CustomMount;
145
146 static char *arg_directory = NULL;
147 static char *arg_template = NULL;
148 static char *arg_user = NULL;
149 static sd_id128_t arg_uuid = {};
150 static char *arg_machine = NULL;
151 static const char *arg_selinux_context = NULL;
152 static const char *arg_selinux_apifs_context = NULL;
153 static const char *arg_slice = NULL;
154 static bool arg_private_network = false;
155 static bool arg_read_only = false;
156 static bool arg_boot = false;
157 static bool arg_ephemeral = false;
158 static LinkJournal arg_link_journal = LINK_AUTO;
159 static bool arg_link_journal_try = false;
160 static uint64_t arg_retain =
161 (1ULL << CAP_CHOWN) |
162 (1ULL << CAP_DAC_OVERRIDE) |
163 (1ULL << CAP_DAC_READ_SEARCH) |
164 (1ULL << CAP_FOWNER) |
165 (1ULL << CAP_FSETID) |
166 (1ULL << CAP_IPC_OWNER) |
167 (1ULL << CAP_KILL) |
168 (1ULL << CAP_LEASE) |
169 (1ULL << CAP_LINUX_IMMUTABLE) |
170 (1ULL << CAP_NET_BIND_SERVICE) |
171 (1ULL << CAP_NET_BROADCAST) |
172 (1ULL << CAP_NET_RAW) |
173 (1ULL << CAP_SETGID) |
174 (1ULL << CAP_SETFCAP) |
175 (1ULL << CAP_SETPCAP) |
176 (1ULL << CAP_SETUID) |
177 (1ULL << CAP_SYS_ADMIN) |
178 (1ULL << CAP_SYS_CHROOT) |
179 (1ULL << CAP_SYS_NICE) |
180 (1ULL << CAP_SYS_PTRACE) |
181 (1ULL << CAP_SYS_TTY_CONFIG) |
182 (1ULL << CAP_SYS_RESOURCE) |
183 (1ULL << CAP_SYS_BOOT) |
184 (1ULL << CAP_AUDIT_WRITE) |
185 (1ULL << CAP_AUDIT_CONTROL) |
186 (1ULL << CAP_MKNOD);
187 static CustomMount *arg_custom_mounts = NULL;
188 static unsigned arg_n_custom_mounts = 0;
189 static char **arg_setenv = NULL;
190 static bool arg_quiet = false;
191 static bool arg_share_system = false;
192 static bool arg_register = true;
193 static bool arg_keep_unit = false;
194 static char **arg_network_interfaces = NULL;
195 static char **arg_network_macvlan = NULL;
196 static char **arg_network_ipvlan = NULL;
197 static bool arg_network_veth = false;
198 static const char *arg_network_bridge = NULL;
199 static unsigned long arg_personality = PERSONALITY_INVALID;
200 static char *arg_image = NULL;
201 static Volatile arg_volatile = VOLATILE_NO;
202 static ExposePort *arg_expose_ports = NULL;
203 static char **arg_property = NULL;
204 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
205 static bool arg_userns = false;
206 static int arg_kill_signal = 0;
207
/* Prints the command-line usage summary to standard output with a single
 * printf() call. The leading "%s" in the format is filled with
 * program_invocation_short_name; everything else is fixed text listing
 * the options accepted by parse_argv(). */
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
216 " if missing\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
231 " container\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
239 " and container\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
243 " the host\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
253 " capability\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
261 " the container\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
266 " the container\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name);
276 }
277
278 static CustomMount* custom_mount_add(CustomMountType t) {
279 CustomMount *c, *ret;
280
281 c = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
282 if (!c)
283 return NULL;
284
285 arg_custom_mounts = c;
286 ret = arg_custom_mounts + arg_n_custom_mounts;
287 arg_n_custom_mounts++;
288
289 *ret = (CustomMount) { .type = t };
290
291 return ret;
292 }
293
294 static void custom_mount_free_all(void) {
295 unsigned i;
296
297 for (i = 0; i < arg_n_custom_mounts; i++) {
298 CustomMount *m = &arg_custom_mounts[i];
299
300 free(m->source);
301 free(m->destination);
302 free(m->options);
303
304 if (m->work_dir) {
305 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
306 free(m->work_dir);
307 }
308
309 strv_free(m->lower);
310 }
311
312 arg_custom_mounts = mfree(arg_custom_mounts);
313 arg_n_custom_mounts = 0;
314 }
315
316 static int custom_mount_compare(const void *a, const void *b) {
317 const CustomMount *x = a, *y = b;
318 int r;
319
320 r = path_compare(x->destination, y->destination);
321 if (r != 0)
322 return r;
323
324 if (x->type < y->type)
325 return -1;
326 if (x->type > y->type)
327 return 1;
328
329 return 0;
330 }
331
332 static int custom_mounts_prepare(void) {
333 unsigned i;
334 int r;
335
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
338
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i = 0; i < arg_n_custom_mounts; i++) {
341 CustomMount *m = &arg_custom_mounts[i];
342
343 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
344 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
345 return -EINVAL;
346 }
347
348 if (m->type != CUSTOM_MOUNT_OVERLAY)
349 continue;
350
351 if (m->work_dir)
352 continue;
353
354 if (m->read_only)
355 continue;
356
357 r = tempfn_random(m->source, NULL, &m->work_dir);
358 if (r < 0)
359 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
360 }
361
362 return 0;
363 }
364
/* Replaces *b with a cleaned-up absolute form of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that
 * fails only because the path does not exist (ENOENT), the path is
 * instead made absolute relative to the current working directory. The
 * result is passed through path_kill_slashes() before being stored.
 *
 * On success the previous value of *b is freed and 0 is returned. On
 * failure *b is left untouched and a negative errno-style value is
 * returned (-ENOMEM if the fallback allocation failed). */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Not existing yet: fall back to a purely lexical absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
385
386 static int parse_argv(int argc, char *argv[]) {
387
388 enum {
389 ARG_VERSION = 0x100,
390 ARG_PRIVATE_NETWORK,
391 ARG_UUID,
392 ARG_READ_ONLY,
393 ARG_CAPABILITY,
394 ARG_DROP_CAPABILITY,
395 ARG_LINK_JOURNAL,
396 ARG_BIND,
397 ARG_BIND_RO,
398 ARG_TMPFS,
399 ARG_OVERLAY,
400 ARG_OVERLAY_RO,
401 ARG_SETENV,
402 ARG_SHARE_SYSTEM,
403 ARG_REGISTER,
404 ARG_KEEP_UNIT,
405 ARG_NETWORK_INTERFACE,
406 ARG_NETWORK_MACVLAN,
407 ARG_NETWORK_IPVLAN,
408 ARG_NETWORK_BRIDGE,
409 ARG_PERSONALITY,
410 ARG_VOLATILE,
411 ARG_TEMPLATE,
412 ARG_PROPERTY,
413 ARG_PRIVATE_USERS,
414 ARG_KILL_SIGNAL,
415 };
416
417 static const struct option options[] = {
418 { "help", no_argument, NULL, 'h' },
419 { "version", no_argument, NULL, ARG_VERSION },
420 { "directory", required_argument, NULL, 'D' },
421 { "template", required_argument, NULL, ARG_TEMPLATE },
422 { "ephemeral", no_argument, NULL, 'x' },
423 { "user", required_argument, NULL, 'u' },
424 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
425 { "boot", no_argument, NULL, 'b' },
426 { "uuid", required_argument, NULL, ARG_UUID },
427 { "read-only", no_argument, NULL, ARG_READ_ONLY },
428 { "capability", required_argument, NULL, ARG_CAPABILITY },
429 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
430 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
431 { "bind", required_argument, NULL, ARG_BIND },
432 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
433 { "tmpfs", required_argument, NULL, ARG_TMPFS },
434 { "overlay", required_argument, NULL, ARG_OVERLAY },
435 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
436 { "machine", required_argument, NULL, 'M' },
437 { "slice", required_argument, NULL, 'S' },
438 { "setenv", required_argument, NULL, ARG_SETENV },
439 { "selinux-context", required_argument, NULL, 'Z' },
440 { "selinux-apifs-context", required_argument, NULL, 'L' },
441 { "quiet", no_argument, NULL, 'q' },
442 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
443 { "register", required_argument, NULL, ARG_REGISTER },
444 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
445 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
446 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
447 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
448 { "network-veth", no_argument, NULL, 'n' },
449 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
450 { "personality", required_argument, NULL, ARG_PERSONALITY },
451 { "image", required_argument, NULL, 'i' },
452 { "volatile", optional_argument, NULL, ARG_VOLATILE },
453 { "port", required_argument, NULL, 'p' },
454 { "property", required_argument, NULL, ARG_PROPERTY },
455 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
456 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
457 {}
458 };
459
460 int c, r;
461 uint64_t plus = 0, minus = 0;
462
463 assert(argc >= 0);
464 assert(argv);
465
466 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
467
468 switch (c) {
469
470 case 'h':
471 help();
472 return 0;
473
474 case ARG_VERSION:
475 puts(PACKAGE_STRING);
476 puts(SYSTEMD_FEATURES);
477 return 0;
478
479 case 'D':
480 r = set_sanitized_path(&arg_directory, optarg);
481 if (r < 0)
482 return log_error_errno(r, "Invalid root directory: %m");
483
484 break;
485
486 case ARG_TEMPLATE:
487 r = set_sanitized_path(&arg_template, optarg);
488 if (r < 0)
489 return log_error_errno(r, "Invalid template directory: %m");
490
491 break;
492
493 case 'i':
494 r = set_sanitized_path(&arg_image, optarg);
495 if (r < 0)
496 return log_error_errno(r, "Invalid image path: %m");
497
498 break;
499
500 case 'x':
501 arg_ephemeral = true;
502 break;
503
504 case 'u':
505 r = free_and_strdup(&arg_user, optarg);
506 if (r < 0)
507 return log_oom();
508
509 break;
510
511 case ARG_NETWORK_BRIDGE:
512 arg_network_bridge = optarg;
513
514 /* fall through */
515
516 case 'n':
517 arg_network_veth = true;
518 arg_private_network = true;
519 break;
520
521 case ARG_NETWORK_INTERFACE:
522 if (strv_extend(&arg_network_interfaces, optarg) < 0)
523 return log_oom();
524
525 arg_private_network = true;
526 break;
527
528 case ARG_NETWORK_MACVLAN:
529 if (strv_extend(&arg_network_macvlan, optarg) < 0)
530 return log_oom();
531
532 arg_private_network = true;
533 break;
534
535 case ARG_NETWORK_IPVLAN:
536 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
537 return log_oom();
538
539 /* fall through */
540
541 case ARG_PRIVATE_NETWORK:
542 arg_private_network = true;
543 break;
544
545 case 'b':
546 arg_boot = true;
547 break;
548
549 case ARG_UUID:
550 r = sd_id128_from_string(optarg, &arg_uuid);
551 if (r < 0) {
552 log_error("Invalid UUID: %s", optarg);
553 return r;
554 }
555 break;
556
557 case 'S':
558 arg_slice = optarg;
559 break;
560
561 case 'M':
562 if (isempty(optarg)) {
563 arg_machine = mfree(arg_machine);
564 } else {
565 if (!machine_name_is_valid(optarg)) {
566 log_error("Invalid machine name: %s", optarg);
567 return -EINVAL;
568 }
569
570 r = free_and_strdup(&arg_machine, optarg);
571 if (r < 0)
572 return log_oom();
573
574 break;
575 }
576
577 case 'Z':
578 arg_selinux_context = optarg;
579 break;
580
581 case 'L':
582 arg_selinux_apifs_context = optarg;
583 break;
584
585 case ARG_READ_ONLY:
586 arg_read_only = true;
587 break;
588
589 case ARG_CAPABILITY:
590 case ARG_DROP_CAPABILITY: {
591 const char *state, *word;
592 size_t length;
593
594 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
595 _cleanup_free_ char *t;
596
597 t = strndup(word, length);
598 if (!t)
599 return log_oom();
600
601 if (streq(t, "all")) {
602 if (c == ARG_CAPABILITY)
603 plus = (uint64_t) -1;
604 else
605 minus = (uint64_t) -1;
606 } else {
607 int cap;
608
609 cap = capability_from_name(t);
610 if (cap < 0) {
611 log_error("Failed to parse capability %s.", t);
612 return -EINVAL;
613 }
614
615 if (c == ARG_CAPABILITY)
616 plus |= 1ULL << (uint64_t) cap;
617 else
618 minus |= 1ULL << (uint64_t) cap;
619 }
620 }
621
622 break;
623 }
624
625 case 'j':
626 arg_link_journal = LINK_GUEST;
627 arg_link_journal_try = true;
628 break;
629
630 case ARG_LINK_JOURNAL:
631 if (streq(optarg, "auto")) {
632 arg_link_journal = LINK_AUTO;
633 arg_link_journal_try = false;
634 } else if (streq(optarg, "no")) {
635 arg_link_journal = LINK_NO;
636 arg_link_journal_try = false;
637 } else if (streq(optarg, "guest")) {
638 arg_link_journal = LINK_GUEST;
639 arg_link_journal_try = false;
640 } else if (streq(optarg, "host")) {
641 arg_link_journal = LINK_HOST;
642 arg_link_journal_try = false;
643 } else if (streq(optarg, "try-guest")) {
644 arg_link_journal = LINK_GUEST;
645 arg_link_journal_try = true;
646 } else if (streq(optarg, "try-host")) {
647 arg_link_journal = LINK_HOST;
648 arg_link_journal_try = true;
649 } else {
650 log_error("Failed to parse link journal mode %s", optarg);
651 return -EINVAL;
652 }
653
654 break;
655
656 case ARG_BIND:
657 case ARG_BIND_RO: {
658 const char *current = optarg;
659 _cleanup_free_ char *source = NULL, *destination = NULL;
660 CustomMount *m;
661
662 r = extract_many_words(&current, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
663 switch (r) {
664 case 1:
665 destination = strdup(source);
666 case 2:
667 break;
668 case -ENOMEM:
669 return log_oom();
670 default:
671 log_error("Invalid bind mount specification: %s", optarg);
672 return -EINVAL;
673 }
674
675 if (!source || !destination)
676 return log_oom();
677
678 if (!path_is_absolute(source) || !path_is_absolute(destination)) {
679 log_error("Invalid bind mount specification: %s", optarg);
680 return -EINVAL;
681 }
682
683 m = custom_mount_add(CUSTOM_MOUNT_BIND);
684 if (!m)
685 return log_oom();
686
687 m->source = source;
688 m->destination = destination;
689 m->read_only = c == ARG_BIND_RO;
690
691 source = destination = NULL;
692
693 break;
694 }
695
696 case ARG_TMPFS: {
697 const char *current = optarg;
698 _cleanup_free_ char *path = NULL, *opts = NULL;
699 CustomMount *m;
700
701 r = extract_first_word(&current, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
702 if (r == -ENOMEM)
703 return log_oom();
704 else if (r < 0) {
705 log_error("Invalid tmpfs specification: %s", optarg);
706 return r;
707 }
708 if (r)
709 opts = strdup(current);
710 else
711 opts = strdup("mode=0755");
712
713 if (!path || !opts)
714 return log_oom();
715
716 if (!path_is_absolute(path)) {
717 log_error("Invalid tmpfs specification: %s", optarg);
718 return -EINVAL;
719 }
720
721 m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
722 if (!m)
723 return log_oom();
724
725 m->destination = path;
726 m->options = opts;
727
728 path = opts = NULL;
729
730 break;
731 }
732
733 case ARG_OVERLAY:
734 case ARG_OVERLAY_RO: {
735 _cleanup_free_ char *upper = NULL, *destination = NULL;
736 _cleanup_strv_free_ char **lower = NULL;
737 CustomMount *m;
738 unsigned n = 0;
739 char **i;
740
741 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
742 if (r == -ENOMEM)
743 return log_oom();
744 else if (r < 0) {
745 log_error("Invalid overlay specification: %s", optarg);
746 return r;
747 }
748
749 STRV_FOREACH(i, lower) {
750 if (!path_is_absolute(*i)) {
751 log_error("Overlay path %s is not absolute.", *i);
752 return -EINVAL;
753 }
754
755 n++;
756 }
757
758 if (n < 2) {
759 log_error("--overlay= needs at least two colon-separated directories specified.");
760 return -EINVAL;
761 }
762
763 if (n == 2) {
764 /* If two parameters are specified,
765 * the first one is the lower, the
766 * second one the upper directory. And
767 * we'll also define the destination
768 * mount point the same as the upper. */
769 upper = lower[1];
770 lower[1] = NULL;
771
772 destination = strdup(upper);
773 if (!destination)
774 return log_oom();
775
776 } else {
777 upper = lower[n - 2];
778 destination = lower[n - 1];
779 lower[n - 2] = NULL;
780 }
781
782 m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
783 if (!m)
784 return log_oom();
785
786 m->destination = destination;
787 m->source = upper;
788 m->lower = lower;
789 m->read_only = c == ARG_OVERLAY_RO;
790
791 upper = destination = NULL;
792 lower = NULL;
793
794 break;
795 }
796
797 case ARG_SETENV: {
798 char **n;
799
800 if (!env_assignment_is_valid(optarg)) {
801 log_error("Environment variable assignment '%s' is not valid.", optarg);
802 return -EINVAL;
803 }
804
805 n = strv_env_set(arg_setenv, optarg);
806 if (!n)
807 return log_oom();
808
809 strv_free(arg_setenv);
810 arg_setenv = n;
811 break;
812 }
813
814 case 'q':
815 arg_quiet = true;
816 break;
817
818 case ARG_SHARE_SYSTEM:
819 arg_share_system = true;
820 break;
821
822 case ARG_REGISTER:
823 r = parse_boolean(optarg);
824 if (r < 0) {
825 log_error("Failed to parse --register= argument: %s", optarg);
826 return r;
827 }
828
829 arg_register = r;
830 break;
831
832 case ARG_KEEP_UNIT:
833 arg_keep_unit = true;
834 break;
835
836 case ARG_PERSONALITY:
837
838 arg_personality = personality_from_string(optarg);
839 if (arg_personality == PERSONALITY_INVALID) {
840 log_error("Unknown or unsupported personality '%s'.", optarg);
841 return -EINVAL;
842 }
843
844 break;
845
846 case ARG_VOLATILE:
847
848 if (!optarg)
849 arg_volatile = VOLATILE_YES;
850 else {
851 r = parse_boolean(optarg);
852 if (r < 0) {
853 if (streq(optarg, "state"))
854 arg_volatile = VOLATILE_STATE;
855 else {
856 log_error("Failed to parse --volatile= argument: %s", optarg);
857 return r;
858 }
859 } else
860 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
861 }
862
863 break;
864
865 case 'p': {
866 const char *split, *e;
867 uint16_t container_port, host_port;
868 int protocol;
869 ExposePort *p;
870
871 if ((e = startswith(optarg, "tcp:")))
872 protocol = IPPROTO_TCP;
873 else if ((e = startswith(optarg, "udp:")))
874 protocol = IPPROTO_UDP;
875 else {
876 e = optarg;
877 protocol = IPPROTO_TCP;
878 }
879
880 split = strchr(e, ':');
881 if (split) {
882 char v[split - e + 1];
883
884 memcpy(v, e, split - e);
885 v[split - e] = 0;
886
887 r = safe_atou16(v, &host_port);
888 if (r < 0 || host_port <= 0) {
889 log_error("Failed to parse host port: %s", optarg);
890 return -EINVAL;
891 }
892
893 r = safe_atou16(split + 1, &container_port);
894 } else {
895 r = safe_atou16(e, &container_port);
896 host_port = container_port;
897 }
898
899 if (r < 0 || container_port <= 0) {
900 log_error("Failed to parse host port: %s", optarg);
901 return -EINVAL;
902 }
903
904 LIST_FOREACH(ports, p, arg_expose_ports) {
905 if (p->protocol == protocol && p->host_port == host_port) {
906 log_error("Duplicate port specification: %s", optarg);
907 return -EINVAL;
908 }
909 }
910
911 p = new(ExposePort, 1);
912 if (!p)
913 return log_oom();
914
915 p->protocol = protocol;
916 p->host_port = host_port;
917 p->container_port = container_port;
918
919 LIST_PREPEND(ports, arg_expose_ports, p);
920
921 break;
922 }
923
924 case ARG_PROPERTY:
925 if (strv_extend(&arg_property, optarg) < 0)
926 return log_oom();
927
928 break;
929
930 case ARG_PRIVATE_USERS:
931 if (optarg) {
932 _cleanup_free_ char *buffer = NULL;
933 const char *range, *shift;
934
935 range = strchr(optarg, ':');
936 if (range) {
937 buffer = strndup(optarg, range - optarg);
938 if (!buffer)
939 return log_oom();
940 shift = buffer;
941
942 range++;
943 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
944 log_error("Failed to parse UID range: %s", range);
945 return -EINVAL;
946 }
947 } else
948 shift = optarg;
949
950 if (parse_uid(shift, &arg_uid_shift) < 0) {
951 log_error("Failed to parse UID: %s", optarg);
952 return -EINVAL;
953 }
954 }
955
956 arg_userns = true;
957 break;
958
959 case ARG_KILL_SIGNAL:
960 arg_kill_signal = signal_from_string_try_harder(optarg);
961 if (arg_kill_signal < 0) {
962 log_error("Cannot parse signal: %s", optarg);
963 return -EINVAL;
964 }
965
966 break;
967
968 case '?':
969 return -EINVAL;
970
971 default:
972 assert_not_reached("Unhandled option");
973 }
974
975 if (arg_share_system)
976 arg_register = false;
977
978 if (arg_boot && arg_share_system) {
979 log_error("--boot and --share-system may not be combined.");
980 return -EINVAL;
981 }
982
983 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
984 log_error("--keep-unit may not be used when invoked from a user session.");
985 return -EINVAL;
986 }
987
988 if (arg_directory && arg_image) {
989 log_error("--directory= and --image= may not be combined.");
990 return -EINVAL;
991 }
992
993 if (arg_template && arg_image) {
994 log_error("--template= and --image= may not be combined.");
995 return -EINVAL;
996 }
997
998 if (arg_template && !(arg_directory || arg_machine)) {
999 log_error("--template= needs --directory= or --machine=.");
1000 return -EINVAL;
1001 }
1002
1003 if (arg_ephemeral && arg_template) {
1004 log_error("--ephemeral and --template= may not be combined.");
1005 return -EINVAL;
1006 }
1007
1008 if (arg_ephemeral && arg_image) {
1009 log_error("--ephemeral and --image= may not be combined.");
1010 return -EINVAL;
1011 }
1012
1013 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
1014 log_error("--ephemeral and --link-journal= may not be combined.");
1015 return -EINVAL;
1016 }
1017
1018 if (arg_volatile != VOLATILE_NO && arg_read_only) {
1019 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1020 return -EINVAL;
1021 }
1022
1023 if (arg_expose_ports && !arg_private_network) {
1024 log_error("Cannot use --port= without private networking.");
1025 return -EINVAL;
1026 }
1027
1028 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
1029 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
1030
1031 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
1032
1033 if (arg_boot && arg_kill_signal <= 0)
1034 arg_kill_signal = SIGRTMIN+3;
1035
1036 return 1;
1037 }
1038
1039 static int tmpfs_patch_options(const char *options, char **ret) {
1040 char *buf = NULL;
1041
1042 if (arg_userns && arg_uid_shift != 0) {
1043 assert(arg_uid_shift != UID_INVALID);
1044
1045 if (options)
1046 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, arg_uid_shift, arg_uid_shift);
1047 else
1048 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
1049 if (!buf)
1050 return -ENOMEM;
1051
1052 options = buf;
1053 }
1054
1055 #ifdef HAVE_SELINUX
1056 if (arg_selinux_apifs_context) {
1057 char *t;
1058
1059 if (options)
1060 t = strjoin(options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
1061 else
1062 t = strjoin("context=\"", arg_selinux_apifs_context, "\"", NULL);
1063 if (!t) {
1064 free(buf);
1065 return -ENOMEM;
1066 }
1067
1068 free(buf);
1069 buf = t;
1070 }
1071 #endif
1072
1073 *ret = buf;
1074 return !!buf;
1075 }
1076
1077 static int mount_all(const char *dest, bool userns) {
1078
1079 typedef struct MountPoint {
1080 const char *what;
1081 const char *where;
1082 const char *type;
1083 const char *options;
1084 unsigned long flags;
1085 bool fatal;
1086 bool userns;
1087 } MountPoint;
1088
1089 static const MountPoint mount_table[] = {
1090 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true },
1091 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
1092 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
1093 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
1094 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
1095 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
1096 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1097 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
1098 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false },
1099 #ifdef HAVE_SELINUX
1100 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false }, /* Bind mount first */
1101 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false }, /* Then, make it r/o */
1102 #endif
1103 };
1104
1105 unsigned k;
1106 int r;
1107
1108 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
1109 _cleanup_free_ char *where = NULL, *options = NULL;
1110 const char *o;
1111
1112 if (userns != mount_table[k].userns)
1113 continue;
1114
1115 where = prefix_root(dest, mount_table[k].where);
1116 if (!where)
1117 return log_oom();
1118
1119 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
1120 if (r < 0 && r != -ENOENT)
1121 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
1122
1123 /* Skip this entry if it is not a remount. */
1124 if (mount_table[k].what && r > 0)
1125 continue;
1126
1127 r = mkdir_p(where, 0755);
1128 if (r < 0) {
1129 if (mount_table[k].fatal)
1130 return log_error_errno(r, "Failed to create directory %s: %m", where);
1131
1132 log_warning_errno(r, "Failed to create directory %s: %m", where);
1133 continue;
1134 }
1135
1136 o = mount_table[k].options;
1137 if (streq_ptr(mount_table[k].type, "tmpfs")) {
1138 r = tmpfs_patch_options(o, &options);
1139 if (r < 0)
1140 return log_oom();
1141 if (r > 0)
1142 o = options;
1143 }
1144
1145 if (mount(mount_table[k].what,
1146 where,
1147 mount_table[k].type,
1148 mount_table[k].flags,
1149 o) < 0) {
1150
1151 if (mount_table[k].fatal)
1152 return log_error_errno(errno, "mount(%s) failed: %m", where);
1153
1154 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
1155 }
1156 }
1157
1158 return 0;
1159 }
1160
1161 static int mount_bind(const char *dest, CustomMount *m) {
1162 struct stat source_st, dest_st;
1163 const char *where;
1164 int r;
1165
1166 assert(m);
1167
1168 if (stat(m->source, &source_st) < 0)
1169 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
1170
1171 where = prefix_roota(dest, m->destination);
1172
1173 if (stat(where, &dest_st) >= 0) {
1174 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
1175 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
1176 return -EINVAL;
1177 }
1178
1179 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
1180 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
1181 return -EINVAL;
1182 }
1183
1184 } else if (errno == ENOENT) {
1185 r = mkdir_parents_label(where, 0755);
1186 if (r < 0)
1187 return log_error_errno(r, "Failed to make parents of %s: %m", where);
1188 } else {
1189 log_error_errno(errno, "Failed to stat %s: %m", where);
1190 return -errno;
1191 }
1192
1193 /* Create the mount point. Any non-directory file can be
1194 * mounted on any non-directory file (regular, fifo, socket,
1195 * char, block).
1196 */
1197 if (S_ISDIR(source_st.st_mode))
1198 r = mkdir_label(where, 0755);
1199 else
1200 r = touch(where);
1201 if (r < 0 && r != -EEXIST)
1202 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1203
1204 if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
1205 return log_error_errno(errno, "mount(%s) failed: %m", where);
1206
1207 if (m->read_only) {
1208 r = bind_remount_recursive(where, true);
1209 if (r < 0)
1210 return log_error_errno(r, "Read-only bind mount failed: %m");
1211 }
1212
1213 return 0;
1214 }
1215
1216 static int mount_tmpfs(const char *dest, CustomMount *m) {
1217 const char *where, *options;
1218 _cleanup_free_ char *buf = NULL;
1219 int r;
1220
1221 assert(dest);
1222 assert(m);
1223
1224 where = prefix_roota(dest, m->destination);
1225
1226 r = mkdir_p_label(where, 0755);
1227 if (r < 0 && r != -EEXIST)
1228 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1229
1230 r = tmpfs_patch_options(m->options, &buf);
1231 if (r < 0)
1232 return log_oom();
1233 options = r > 0 ? buf : m->options;
1234
1235 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
1236 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1237
1238 return 0;
1239 }
1240
1241 static char *joined_and_escaped_lower_dirs(char * const *lower) {
1242 _cleanup_strv_free_ char **sv = NULL;
1243
1244 sv = strv_copy(lower);
1245 if (!sv)
1246 return NULL;
1247
1248 strv_reverse(sv);
1249
1250 if (!strv_shell_escape(sv, ",:"))
1251 return NULL;
1252
1253 return strv_join(sv, ":");
1254 }
1255
1256 static int mount_overlay(const char *dest, CustomMount *m) {
1257 _cleanup_free_ char *lower = NULL;
1258 const char *where, *options;
1259 int r;
1260
1261 assert(dest);
1262 assert(m);
1263
1264 where = prefix_roota(dest, m->destination);
1265
1266 r = mkdir_label(where, 0755);
1267 if (r < 0 && r != -EEXIST)
1268 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
1269
1270 (void) mkdir_p_label(m->source, 0755);
1271
1272 lower = joined_and_escaped_lower_dirs(m->lower);
1273 if (!lower)
1274 return log_oom();
1275
1276 if (m->read_only) {
1277 _cleanup_free_ char *escaped_source = NULL;
1278
1279 escaped_source = shell_escape(m->source, ",:");
1280 if (!escaped_source)
1281 return log_oom();
1282
1283 options = strjoina("lowerdir=", escaped_source, ":", lower);
1284 } else {
1285 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
1286
1287 assert(m->work_dir);
1288 (void) mkdir_label(m->work_dir, 0700);
1289
1290 escaped_source = shell_escape(m->source, ",:");
1291 if (!escaped_source)
1292 return log_oom();
1293 escaped_work_dir = shell_escape(m->work_dir, ",:");
1294 if (!escaped_work_dir)
1295 return log_oom();
1296
1297 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
1298 }
1299
1300 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
1301 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
1302
1303 return 0;
1304 }
1305
1306 static int mount_custom(const char *dest) {
1307 unsigned i;
1308 int r;
1309
1310 assert(dest);
1311
1312 for (i = 0; i < arg_n_custom_mounts; i++) {
1313 CustomMount *m = &arg_custom_mounts[i];
1314
1315 switch (m->type) {
1316
1317 case CUSTOM_MOUNT_BIND:
1318 r = mount_bind(dest, m);
1319 break;
1320
1321 case CUSTOM_MOUNT_TMPFS:
1322 r = mount_tmpfs(dest, m);
1323 break;
1324
1325 case CUSTOM_MOUNT_OVERLAY:
1326 r = mount_overlay(dest, m);
1327 break;
1328
1329 default:
1330 assert_not_reached("Unknown custom mount type");
1331 }
1332
1333 if (r < 0)
1334 return r;
1335 }
1336
1337 return 0;
1338 }
1339
1340 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1341 char *to;
1342 int r;
1343
1344 to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1345
1346 r = path_is_mount_point(to, 0);
1347 if (r < 0 && r != -ENOENT)
1348 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1349 if (r > 0)
1350 return 0;
1351
1352 mkdir_p(to, 0755);
1353
1354 /* The superblock mount options of the mount point need to be
1355 * identical to the hosts', and hence writable... */
1356 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1357 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1358
1359 /* ... hence let's only make the bind mount read-only, not the
1360 * superblock. */
1361 if (read_only) {
1362 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1363 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1364 }
1365 return 1;
1366 }
1367
1368 static int mount_cgroup(const char *dest) {
1369 _cleanup_set_free_free_ Set *controllers = NULL;
1370 const char *cgroup_root;
1371 int r;
1372
1373 controllers = set_new(&string_hash_ops);
1374 if (!controllers)
1375 return log_oom();
1376
1377 r = cg_kernel_controllers(controllers);
1378 if (r < 0)
1379 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1380
1381 for (;;) {
1382 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1383
1384 controller = set_steal_first(controllers);
1385 if (!controller)
1386 break;
1387
1388 origin = prefix_root("/sys/fs/cgroup/", controller);
1389 if (!origin)
1390 return log_oom();
1391
1392 r = readlink_malloc(origin, &combined);
1393 if (r == -EINVAL) {
1394 /* Not a symbolic link, but directly a single cgroup hierarchy */
1395
1396 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1397 if (r < 0)
1398 return r;
1399
1400 } else if (r < 0)
1401 return log_error_errno(r, "Failed to read link %s: %m", origin);
1402 else {
1403 _cleanup_free_ char *target = NULL;
1404
1405 target = prefix_root(dest, origin);
1406 if (!target)
1407 return log_oom();
1408
1409 /* A symbolic link, a combination of controllers in one hierarchy */
1410
1411 if (!filename_is_valid(combined)) {
1412 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1413 continue;
1414 }
1415
1416 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1417 if (r < 0)
1418 return r;
1419
1420 r = symlink_idempotent(combined, target);
1421 if (r == -EINVAL) {
1422 log_error("Invalid existing symlink for combined hierarchy");
1423 return r;
1424 }
1425 if (r < 0)
1426 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
1427 }
1428 }
1429
1430 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1431 if (r < 0)
1432 return r;
1433
1434 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
1435 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1436 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1437
1438 return 0;
1439 }
1440
1441 static int mount_systemd_cgroup_writable(const char *dest) {
1442 _cleanup_free_ char *own_cgroup_path = NULL;
1443 const char *systemd_root, *systemd_own;
1444 int r;
1445
1446 assert(dest);
1447
1448 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1449 if (r < 0)
1450 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1451
1452 /* Make our own cgroup a (writable) bind mount */
1453 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1454 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1455 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1456
1457 /* And then remount the systemd cgroup root read-only */
1458 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
1459 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1460 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1461
1462 return 0;
1463 }
1464
1465 static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1466 assert(p);
1467
1468 if (!arg_userns)
1469 return 0;
1470
1471 if (uid == UID_INVALID && gid == GID_INVALID)
1472 return 0;
1473
1474 if (uid != UID_INVALID) {
1475 uid += arg_uid_shift;
1476
1477 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1478 return -EOVERFLOW;
1479 }
1480
1481 if (gid != GID_INVALID) {
1482 gid += (gid_t) arg_uid_shift;
1483
1484 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1485 return -EOVERFLOW;
1486 }
1487
1488 if (lchown(p, uid, gid) < 0)
1489 return -errno;
1490
1491 return 0;
1492 }
1493
/* Creates directory "path" below "root" and hands a newly created directory
 * to userns_lchown(). A directory that already exists is left alone and
 * counts as success. */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
1506
1507 static int setup_timezone(const char *dest) {
1508 _cleanup_free_ char *p = NULL, *q = NULL;
1509 const char *where, *check, *what;
1510 char *z, *y;
1511 int r;
1512
1513 assert(dest);
1514
1515 /* Fix the timezone, if possible */
1516 r = readlink_malloc("/etc/localtime", &p);
1517 if (r < 0) {
1518 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1519 return 0;
1520 }
1521
1522 z = path_startswith(p, "../usr/share/zoneinfo/");
1523 if (!z)
1524 z = path_startswith(p, "/usr/share/zoneinfo/");
1525 if (!z) {
1526 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1527 return 0;
1528 }
1529
1530 where = prefix_roota(dest, "/etc/localtime");
1531 r = readlink_malloc(where, &q);
1532 if (r >= 0) {
1533 y = path_startswith(q, "../usr/share/zoneinfo/");
1534 if (!y)
1535 y = path_startswith(q, "/usr/share/zoneinfo/");
1536
1537 /* Already pointing to the right place? Then do nothing .. */
1538 if (y && streq(y, z))
1539 return 0;
1540 }
1541
1542 check = strjoina("/usr/share/zoneinfo/", z);
1543 check = prefix_root(dest, check);
1544 if (laccess(check, F_OK) < 0) {
1545 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1546 return 0;
1547 }
1548
1549 r = unlink(where);
1550 if (r < 0 && errno != ENOENT) {
1551 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1552 return 0;
1553 }
1554
1555 what = strjoina("../usr/share/zoneinfo/", z);
1556 if (symlink(what, where) < 0) {
1557 log_error_errno(errno, "Failed to correct timezone of container: %m");
1558 return 0;
1559 }
1560
1561 r = userns_lchown(where, 0, 0);
1562 if (r < 0)
1563 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1564
1565 return 0;
1566 }
1567
1568 static int setup_resolv_conf(const char *dest) {
1569 const char *where = NULL;
1570 int r;
1571
1572 assert(dest);
1573
1574 if (arg_private_network)
1575 return 0;
1576
1577 /* Fix resolv.conf, if possible */
1578 where = prefix_roota(dest, "/etc/resolv.conf");
1579
1580 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1581 if (r < 0) {
1582 /* If the file already exists as symlink, let's
1583 * suppress the warning, under the assumption that
1584 * resolved or something similar runs inside and the
1585 * symlink points there.
1586 *
1587 * If the disk image is read-only, there's also no
1588 * point in complaining.
1589 */
1590 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1591 "Failed to copy /etc/resolv.conf to %s: %m", where);
1592 return 0;
1593 }
1594
1595 r = userns_lchown(where, 0, 0);
1596 if (r < 0)
1597 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1598
1599 return 0;
1600 }
1601
1602 static int setup_volatile_state(const char *directory) {
1603 _cleanup_free_ char *buf = NULL;
1604 const char *p, *options;
1605 int r;
1606
1607 assert(directory);
1608
1609 if (arg_volatile != VOLATILE_STATE)
1610 return 0;
1611
1612 /* --volatile=state means we simply overmount /var
1613 with a tmpfs, and the rest read-only. */
1614
1615 r = bind_remount_recursive(directory, true);
1616 if (r < 0)
1617 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1618
1619 p = prefix_roota(directory, "/var");
1620 r = mkdir(p, 0755);
1621 if (r < 0 && errno != EEXIST)
1622 return log_error_errno(errno, "Failed to create %s: %m", directory);
1623
1624 options = "mode=755";
1625 r = tmpfs_patch_options(options, &buf);
1626 if (r < 0)
1627 return log_oom();
1628 if (r > 0)
1629 options = buf;
1630
1631 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
1632 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1633
1634 return 0;
1635 }
1636
1637 static int setup_volatile(const char *directory) {
1638 bool tmpfs_mounted = false, bind_mounted = false;
1639 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1640 _cleanup_free_ char *buf = NULL;
1641 const char *f, *t, *options;
1642 int r;
1643
1644 assert(directory);
1645
1646 if (arg_volatile != VOLATILE_YES)
1647 return 0;
1648
1649 /* --volatile=yes means we mount a tmpfs to the root dir, and
1650 the original /usr to use inside it, and that read-only. */
1651
1652 if (!mkdtemp(template))
1653 return log_error_errno(errno, "Failed to create temporary directory: %m");
1654
1655 options = "mode=755";
1656 r = tmpfs_patch_options(options, &buf);
1657 if (r < 0)
1658 return log_oom();
1659 if (r > 0)
1660 options = buf;
1661
1662 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
1663 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1664 goto fail;
1665 }
1666
1667 tmpfs_mounted = true;
1668
1669 f = prefix_roota(directory, "/usr");
1670 t = prefix_roota(template, "/usr");
1671
1672 r = mkdir(t, 0755);
1673 if (r < 0 && errno != EEXIST) {
1674 r = log_error_errno(errno, "Failed to create %s: %m", t);
1675 goto fail;
1676 }
1677
1678 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1679 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
1680 goto fail;
1681 }
1682
1683 bind_mounted = true;
1684
1685 r = bind_remount_recursive(t, true);
1686 if (r < 0) {
1687 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1688 goto fail;
1689 }
1690
1691 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1692 r = log_error_errno(errno, "Failed to move root mount: %m");
1693 goto fail;
1694 }
1695
1696 (void) rmdir(template);
1697
1698 return 0;
1699
1700 fail:
1701 if (bind_mounted)
1702 (void) umount(t);
1703
1704 if (tmpfs_mounted)
1705 (void) umount(template);
1706 (void) rmdir(template);
1707 return r;
1708 }
1709
1710 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1711 assert(s);
1712
1713 snprintf(s, 37,
1714 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1715 SD_ID128_FORMAT_VAL(id));
1716
1717 return s;
1718 }
1719
1720 static int setup_boot_id(const char *dest) {
1721 const char *from, *to;
1722 sd_id128_t rnd = {};
1723 char as_uuid[37];
1724 int r;
1725
1726 if (arg_share_system)
1727 return 0;
1728
1729 /* Generate a new randomized boot ID, so that each boot-up of
1730 * the container gets a new one */
1731
1732 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1733 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
1734
1735 r = sd_id128_randomize(&rnd);
1736 if (r < 0)
1737 return log_error_errno(r, "Failed to generate random boot id: %m");
1738
1739 id128_format_as_uuid(rnd, as_uuid);
1740
1741 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1742 if (r < 0)
1743 return log_error_errno(r, "Failed to write boot id: %m");
1744
1745 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1746 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1747 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1748 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1749
1750 unlink(from);
1751 return r;
1752 }
1753
1754 static int copy_devnodes(const char *dest) {
1755
1756 static const char devnodes[] =
1757 "null\0"
1758 "zero\0"
1759 "full\0"
1760 "random\0"
1761 "urandom\0"
1762 "tty\0"
1763 "net/tun\0";
1764
1765 const char *d;
1766 int r = 0;
1767 _cleanup_umask_ mode_t u;
1768
1769 assert(dest);
1770
1771 u = umask(0000);
1772
1773 /* Create /dev/net, so that we can create /dev/net/tun in it */
1774 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1775 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1776
1777 NULSTR_FOREACH(d, devnodes) {
1778 _cleanup_free_ char *from = NULL, *to = NULL;
1779 struct stat st;
1780
1781 from = strappend("/dev/", d);
1782 to = prefix_root(dest, from);
1783
1784 if (stat(from, &st) < 0) {
1785
1786 if (errno != ENOENT)
1787 return log_error_errno(errno, "Failed to stat %s: %m", from);
1788
1789 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1790
1791 log_error("%s is not a char or block device, cannot copy.", from);
1792 return -EIO;
1793
1794 } else {
1795 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1796 if (errno != EPERM)
1797 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1798
1799 /* Some systems abusively restrict mknod but
1800 * allow bind mounts. */
1801 r = touch(to);
1802 if (r < 0)
1803 return log_error_errno(r, "touch (%s) failed: %m", to);
1804 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1805 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1806 }
1807
1808 r = userns_lchown(to, 0, 0);
1809 if (r < 0)
1810 return log_error_errno(r, "chown() of device node %s failed: %m", to);
1811 }
1812 }
1813
1814 return r;
1815 }
1816
1817 static int setup_pts(const char *dest) {
1818 _cleanup_free_ char *options = NULL;
1819 const char *p;
1820
1821 #ifdef HAVE_SELINUX
1822 if (arg_selinux_apifs_context)
1823 (void) asprintf(&options,
1824 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
1825 arg_uid_shift + TTY_GID,
1826 arg_selinux_apifs_context);
1827 else
1828 #endif
1829 (void) asprintf(&options,
1830 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
1831 arg_uid_shift + TTY_GID);
1832
1833 if (!options)
1834 return log_oom();
1835
1836 /* Mount /dev/pts itself */
1837 p = prefix_roota(dest, "/dev/pts");
1838 if (mkdir(p, 0755) < 0)
1839 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1840 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1841 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1842 if (userns_lchown(p, 0, 0) < 0)
1843 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1844
1845 /* Create /dev/ptmx symlink */
1846 p = prefix_roota(dest, "/dev/ptmx");
1847 if (symlink("pts/ptmx", p) < 0)
1848 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1849 if (userns_lchown(p, 0, 0) < 0)
1850 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1851
1852 /* And fix /dev/pts/ptmx ownership */
1853 p = prefix_roota(dest, "/dev/pts/ptmx");
1854 if (userns_lchown(p, 0, 0) < 0)
1855 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
1856
1857 return 0;
1858 }
1859
1860 static int setup_dev_console(const char *dest, const char *console) {
1861 _cleanup_umask_ mode_t u;
1862 const char *to;
1863 int r;
1864
1865 assert(dest);
1866 assert(console);
1867
1868 u = umask(0000);
1869
1870 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
1871 if (r < 0)
1872 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1873
1874 /* We need to bind mount the right tty to /dev/console since
1875 * ptys can only exist on pts file systems. To have something
1876 * to bind mount things on we create a empty regular file. */
1877
1878 to = prefix_roota(dest, "/dev/console");
1879 r = touch(to);
1880 if (r < 0)
1881 return log_error_errno(r, "touch() for /dev/console failed: %m");
1882
1883 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1884 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1885
1886 return 0;
1887 }
1888
1889 static int setup_kmsg(const char *dest, int kmsg_socket) {
1890 const char *from, *to;
1891 _cleanup_umask_ mode_t u;
1892 int fd, k;
1893 union {
1894 struct cmsghdr cmsghdr;
1895 uint8_t buf[CMSG_SPACE(sizeof(int))];
1896 } control = {};
1897 struct msghdr mh = {
1898 .msg_control = &control,
1899 .msg_controllen = sizeof(control),
1900 };
1901 struct cmsghdr *cmsg;
1902
1903 assert(kmsg_socket >= 0);
1904
1905 u = umask(0000);
1906
1907 /* We create the kmsg FIFO as /run/kmsg, but immediately
1908 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1909 * on the reading side behave very similar to /proc/kmsg,
1910 * their writing side behaves differently from /dev/kmsg in
1911 * that writing blocks when nothing is reading. In order to
1912 * avoid any problems with containers deadlocking due to this
1913 * we simply make /dev/kmsg unavailable to the container. */
1914 from = prefix_roota(dest, "/run/kmsg");
1915 to = prefix_roota(dest, "/proc/kmsg");
1916
1917 if (mkfifo(from, 0600) < 0)
1918 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1919 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1920 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1921
1922 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1923 if (fd < 0)
1924 return log_error_errno(errno, "Failed to open fifo: %m");
1925
1926 cmsg = CMSG_FIRSTHDR(&mh);
1927 cmsg->cmsg_level = SOL_SOCKET;
1928 cmsg->cmsg_type = SCM_RIGHTS;
1929 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1930 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1931
1932 mh.msg_controllen = cmsg->cmsg_len;
1933
1934 /* Store away the fd in the socket, so that it stays open as
1935 * long as we run the child */
1936 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1937 safe_close(fd);
1938
1939 if (k < 0)
1940 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1941
1942 /* And now make the FIFO unavailable as /run/kmsg... */
1943 (void) unlink(from);
1944
1945 return 0;
1946 }
1947
1948 static int send_rtnl(int send_fd) {
1949 union {
1950 struct cmsghdr cmsghdr;
1951 uint8_t buf[CMSG_SPACE(sizeof(int))];
1952 } control = {};
1953 struct msghdr mh = {
1954 .msg_control = &control,
1955 .msg_controllen = sizeof(control),
1956 };
1957 struct cmsghdr *cmsg;
1958 _cleanup_close_ int fd = -1;
1959 ssize_t k;
1960
1961 assert(send_fd >= 0);
1962
1963 if (!arg_expose_ports)
1964 return 0;
1965
1966 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1967 if (fd < 0)
1968 return log_error_errno(errno, "Failed to allocate container netlink: %m");
1969
1970 cmsg = CMSG_FIRSTHDR(&mh);
1971 cmsg->cmsg_level = SOL_SOCKET;
1972 cmsg->cmsg_type = SCM_RIGHTS;
1973 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1974 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1975
1976 mh.msg_controllen = cmsg->cmsg_len;
1977
1978 /* Store away the fd in the socket, so that it stays open as
1979 * long as we run the child */
1980 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1981 if (k < 0)
1982 return log_error_errno(errno, "Failed to send netlink fd: %m");
1983
1984 return 0;
1985 }
1986
1987 static int flush_ports(union in_addr_union *exposed) {
1988 ExposePort *p;
1989 int r, af = AF_INET;
1990
1991 assert(exposed);
1992
1993 if (!arg_expose_ports)
1994 return 0;
1995
1996 if (in_addr_is_null(af, exposed))
1997 return 0;
1998
1999 log_debug("Lost IP address.");
2000
2001 LIST_FOREACH(ports, p, arg_expose_ports) {
2002 r = fw_add_local_dnat(false,
2003 af,
2004 p->protocol,
2005 NULL,
2006 NULL, 0,
2007 NULL, 0,
2008 p->host_port,
2009 exposed,
2010 p->container_port,
2011 NULL);
2012 if (r < 0)
2013 log_warning_errno(r, "Failed to modify firewall: %m");
2014 }
2015
2016 *exposed = IN_ADDR_NULL;
2017 return 0;
2018 }
2019
2020 static int expose_ports(sd_netlink *rtnl, union in_addr_union *exposed) {
2021 _cleanup_free_ struct local_address *addresses = NULL;
2022 _cleanup_free_ char *pretty = NULL;
2023 union in_addr_union new_exposed;
2024 ExposePort *p;
2025 bool add;
2026 int af = AF_INET, r;
2027
2028 assert(exposed);
2029
2030 /* Invoked each time an address is added or removed inside the
2031 * container */
2032
2033 if (!arg_expose_ports)
2034 return 0;
2035
2036 r = local_addresses(rtnl, 0, af, &addresses);
2037 if (r < 0)
2038 return log_error_errno(r, "Failed to enumerate local addresses: %m");
2039
2040 add = r > 0 &&
2041 addresses[0].family == af &&
2042 addresses[0].scope < RT_SCOPE_LINK;
2043
2044 if (!add)
2045 return flush_ports(exposed);
2046
2047 new_exposed = addresses[0].address;
2048 if (in_addr_equal(af, exposed, &new_exposed))
2049 return 0;
2050
2051 in_addr_to_string(af, &new_exposed, &pretty);
2052 log_debug("New container IP is %s.", strna(pretty));
2053
2054 LIST_FOREACH(ports, p, arg_expose_ports) {
2055
2056 r = fw_add_local_dnat(true,
2057 af,
2058 p->protocol,
2059 NULL,
2060 NULL, 0,
2061 NULL, 0,
2062 p->host_port,
2063 &new_exposed,
2064 p->container_port,
2065 in_addr_is_null(af, exposed) ? NULL : exposed);
2066 if (r < 0)
2067 log_warning_errno(r, "Failed to modify firewall: %m");
2068 }
2069
2070 *exposed = new_exposed;
2071 return 0;
2072 }
2073
2074 static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
2075 union in_addr_union *exposed = userdata;
2076
2077 assert(rtnl);
2078 assert(m);
2079 assert(exposed);
2080
2081 expose_ports(rtnl, exposed);
2082 return 0;
2083 }
2084
2085 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_netlink **ret) {
2086 union {
2087 struct cmsghdr cmsghdr;
2088 uint8_t buf[CMSG_SPACE(sizeof(int))];
2089 } control = {};
2090 struct msghdr mh = {
2091 .msg_control = &control,
2092 .msg_controllen = sizeof(control),
2093 };
2094 struct cmsghdr *cmsg;
2095 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2096 int fd, r;
2097 ssize_t k;
2098
2099 assert(event);
2100 assert(recv_fd >= 0);
2101 assert(ret);
2102
2103 if (!arg_expose_ports)
2104 return 0;
2105
2106 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
2107 if (k < 0)
2108 return log_error_errno(errno, "Failed to recv netlink fd: %m");
2109
2110 cmsg = CMSG_FIRSTHDR(&mh);
2111 assert(cmsg->cmsg_level == SOL_SOCKET);
2112 assert(cmsg->cmsg_type == SCM_RIGHTS);
2113 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
2114 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
2115
2116 r = sd_netlink_open_fd(&rtnl, fd);
2117 if (r < 0) {
2118 safe_close(fd);
2119 return log_error_errno(r, "Failed to create rtnl object: %m");
2120 }
2121
2122 r = sd_netlink_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
2123 if (r < 0)
2124 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
2125
2126 r = sd_netlink_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
2127 if (r < 0)
2128 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
2129
2130 r = sd_netlink_attach_event(rtnl, event, 0);
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to add to even loop: %m");
2133
2134 *ret = rtnl;
2135 rtnl = NULL;
2136
2137 return 0;
2138 }
2139
2140 static int setup_hostname(void) {
2141
2142 if (arg_share_system)
2143 return 0;
2144
2145 if (sethostname_idempotent(arg_machine) < 0)
2146 return -errno;
2147
2148 return 0;
2149 }
2150
2151 static int setup_journal(const char *directory) {
2152 sd_id128_t machine_id, this_id;
2153 _cleanup_free_ char *b = NULL, *d = NULL;
2154 const char *etc_machine_id, *p, *q;
2155 char *id;
2156 int r;
2157
2158 /* Don't link journals in ephemeral mode */
2159 if (arg_ephemeral)
2160 return 0;
2161
2162 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2163
2164 r = read_one_line_file(etc_machine_id, &b);
2165 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
2166 return 0;
2167 else if (r < 0)
2168 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
2169
2170 id = strstrip(b);
2171 if (isempty(id) && arg_link_journal == LINK_AUTO)
2172 return 0;
2173
2174 /* Verify validity */
2175 r = sd_id128_from_string(id, &machine_id);
2176 if (r < 0)
2177 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
2178
2179 r = sd_id128_get_machine(&this_id);
2180 if (r < 0)
2181 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2182
2183 if (sd_id128_equal(machine_id, this_id)) {
2184 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
2185 "Host and machine ids are equal (%s): refusing to link journals", id);
2186 if (arg_link_journal == LINK_AUTO)
2187 return 0;
2188 return -EEXIST;
2189 }
2190
2191 if (arg_link_journal == LINK_NO)
2192 return 0;
2193
2194 r = userns_mkdir(directory, "/var", 0755, 0, 0);
2195 if (r < 0)
2196 return log_error_errno(r, "Failed to create /var: %m");
2197
2198 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
2199 if (r < 0)
2200 return log_error_errno(r, "Failed to create /var/log: %m");
2201
2202 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
2203 if (r < 0)
2204 return log_error_errno(r, "Failed to create /var/log/journal: %m");
2205
2206 p = strjoina("/var/log/journal/", id);
2207 q = prefix_roota(directory, p);
2208
2209 if (path_is_mount_point(p, 0) > 0) {
2210 if (arg_link_journal != LINK_AUTO) {
2211 log_error("%s: already a mount point, refusing to use for journal", p);
2212 return -EEXIST;
2213 }
2214
2215 return 0;
2216 }
2217
2218 if (path_is_mount_point(q, 0) > 0) {
2219 if (arg_link_journal != LINK_AUTO) {
2220 log_error("%s: already a mount point, refusing to use for journal", q);
2221 return -EEXIST;
2222 }
2223
2224 return 0;
2225 }
2226
2227 r = readlink_and_make_absolute(p, &d);
2228 if (r >= 0) {
2229 if ((arg_link_journal == LINK_GUEST ||
2230 arg_link_journal == LINK_AUTO) &&
2231 path_equal(d, q)) {
2232
2233 r = userns_mkdir(directory, p, 0755, 0, 0);
2234 if (r < 0)
2235 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2236 return 0;
2237 }
2238
2239 if (unlink(p) < 0)
2240 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2241 } else if (r == -EINVAL) {
2242
2243 if (arg_link_journal == LINK_GUEST &&
2244 rmdir(p) < 0) {
2245
2246 if (errno == ENOTDIR) {
2247 log_error("%s already exists and is neither a symlink nor a directory", p);
2248 return r;
2249 } else {
2250 log_error_errno(errno, "Failed to remove %s: %m", p);
2251 return -errno;
2252 }
2253 }
2254 } else if (r != -ENOENT) {
2255 log_error_errno(errno, "readlink(%s) failed: %m", p);
2256 return r;
2257 }
2258
2259 if (arg_link_journal == LINK_GUEST) {
2260
2261 if (symlink(q, p) < 0) {
2262 if (arg_link_journal_try) {
2263 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2264 return 0;
2265 } else {
2266 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2267 return -errno;
2268 }
2269 }
2270
2271 r = userns_mkdir(directory, p, 0755, 0, 0);
2272 if (r < 0)
2273 log_warning_errno(errno, "Failed to create directory %s: %m", q);
2274 return 0;
2275 }
2276
2277 if (arg_link_journal == LINK_HOST) {
2278 /* don't create parents here -- if the host doesn't have
2279 * permanent journal set up, don't force it here */
2280 r = mkdir(p, 0755);
2281 if (r < 0) {
2282 if (arg_link_journal_try) {
2283 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
2284 return 0;
2285 } else {
2286 log_error_errno(errno, "Failed to create %s: %m", p);
2287 return r;
2288 }
2289 }
2290
2291 } else if (access(p, F_OK) < 0)
2292 return 0;
2293
2294 if (dir_is_empty(q) == 0)
2295 log_warning("%s is not empty, proceeding anyway.", q);
2296
2297 r = userns_mkdir(directory, p, 0755, 0, 0);
2298 if (r < 0) {
2299 log_error_errno(errno, "Failed to create %s: %m", q);
2300 return r;
2301 }
2302
2303 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2304 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2305
2306 return 0;
2307 }
2308
2309 static int drop_capabilities(void) {
2310 return capability_bounding_set_drop(~arg_retain, false);
2311 }
2312
2313 static int register_machine(pid_t pid, int local_ifindex) {
2314 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2315 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2316 int r;
2317
2318 if (!arg_register)
2319 return 0;
2320
2321 r = sd_bus_default_system(&bus);
2322 if (r < 0)
2323 return log_error_errno(r, "Failed to open system bus: %m");
2324
2325 if (arg_keep_unit) {
2326 r = sd_bus_call_method(
2327 bus,
2328 "org.freedesktop.machine1",
2329 "/org/freedesktop/machine1",
2330 "org.freedesktop.machine1.Manager",
2331 "RegisterMachineWithNetwork",
2332 &error,
2333 NULL,
2334 "sayssusai",
2335 arg_machine,
2336 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2337 "nspawn",
2338 "container",
2339 (uint32_t) pid,
2340 strempty(arg_directory),
2341 local_ifindex > 0 ? 1 : 0, local_ifindex);
2342 } else {
2343 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
2344 char **i;
2345 unsigned j;
2346
2347 r = sd_bus_message_new_method_call(
2348 bus,
2349 &m,
2350 "org.freedesktop.machine1",
2351 "/org/freedesktop/machine1",
2352 "org.freedesktop.machine1.Manager",
2353 "CreateMachineWithNetwork");
2354 if (r < 0)
2355 return bus_log_create_error(r);
2356
2357 r = sd_bus_message_append(
2358 m,
2359 "sayssusai",
2360 arg_machine,
2361 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
2362 "nspawn",
2363 "container",
2364 (uint32_t) pid,
2365 strempty(arg_directory),
2366 local_ifindex > 0 ? 1 : 0, local_ifindex);
2367 if (r < 0)
2368 return bus_log_create_error(r);
2369
2370 r = sd_bus_message_open_container(m, 'a', "(sv)");
2371 if (r < 0)
2372 return bus_log_create_error(r);
2373
2374 if (!isempty(arg_slice)) {
2375 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2376 if (r < 0)
2377 return bus_log_create_error(r);
2378 }
2379
2380 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2381 if (r < 0)
2382 return bus_log_create_error(r);
2383
2384 /* If you make changes here, also make sure to update
2385 * systemd-nspawn@.service, to keep the device
2386 * policies in sync regardless if we are run with or
2387 * without the --keep-unit switch. */
2388 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2389 /* Allow the container to
2390 * access and create the API
2391 * device nodes, so that
2392 * PrivateDevices= in the
2393 * container can work
2394 * fine */
2395 "/dev/null", "rwm",
2396 "/dev/zero", "rwm",
2397 "/dev/full", "rwm",
2398 "/dev/random", "rwm",
2399 "/dev/urandom", "rwm",
2400 "/dev/tty", "rwm",
2401 "/dev/net/tun", "rwm",
2402 /* Allow the container
2403 * access to ptys. However,
2404 * do not permit the
2405 * container to ever create
2406 * these device nodes. */
2407 "/dev/pts/ptmx", "rw",
2408 "char-pts", "rw");
2409 if (r < 0)
2410 return bus_log_create_error(r);
2411
2412 for (j = 0; j < arg_n_custom_mounts; j++) {
2413 CustomMount *cm = &arg_custom_mounts[j];
2414
2415 if (cm->type != CUSTOM_MOUNT_BIND)
2416 continue;
2417
2418 r = is_device_node(cm->source);
2419 if (r < 0)
2420 return log_error_errno(r, "Failed to stat %s: %m", cm->source);
2421
2422 if (r) {
2423 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 1,
2424 cm->source, cm->read_only ? "r" : "rw");
2425 if (r < 0)
2426 return log_error_errno(r, "Failed to append message arguments: %m");
2427 }
2428 }
2429
2430 if (arg_kill_signal != 0) {
2431 r = sd_bus_message_append(m, "(sv)", "KillSignal", "i", arg_kill_signal);
2432 if (r < 0)
2433 return bus_log_create_error(r);
2434
2435 r = sd_bus_message_append(m, "(sv)", "KillMode", "s", "mixed");
2436 if (r < 0)
2437 return bus_log_create_error(r);
2438 }
2439
2440 STRV_FOREACH(i, arg_property) {
2441 r = sd_bus_message_open_container(m, 'r', "sv");
2442 if (r < 0)
2443 return bus_log_create_error(r);
2444
2445 r = bus_append_unit_property_assignment(m, *i);
2446 if (r < 0)
2447 return r;
2448
2449 r = sd_bus_message_close_container(m);
2450 if (r < 0)
2451 return bus_log_create_error(r);
2452 }
2453
2454 r = sd_bus_message_close_container(m);
2455 if (r < 0)
2456 return bus_log_create_error(r);
2457
2458 r = sd_bus_call(bus, m, 0, &error, NULL);
2459 }
2460
2461 if (r < 0) {
2462 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2463 return r;
2464 }
2465
2466 return 0;
2467 }
2468
2469 static int terminate_machine(pid_t pid) {
2470 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2471 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2472 _cleanup_bus_flush_close_unref_ sd_bus *bus = NULL;
2473 const char *path;
2474 int r;
2475
2476 if (!arg_register)
2477 return 0;
2478
2479 /* If we are reusing the unit, then just exit, systemd will do
2480 * the right thing when we exit. */
2481 if (arg_keep_unit)
2482 return 0;
2483
2484 r = sd_bus_default_system(&bus);
2485 if (r < 0)
2486 return log_error_errno(r, "Failed to open system bus: %m");
2487
2488 r = sd_bus_call_method(
2489 bus,
2490 "org.freedesktop.machine1",
2491 "/org/freedesktop/machine1",
2492 "org.freedesktop.machine1.Manager",
2493 "GetMachineByPID",
2494 &error,
2495 &reply,
2496 "u",
2497 (uint32_t) pid);
2498 if (r < 0) {
2499 /* Note that the machine might already have been
2500 * cleaned up automatically, hence don't consider it a
2501 * failure if we cannot get the machine object. */
2502 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2503 return 0;
2504 }
2505
2506 r = sd_bus_message_read(reply, "o", &path);
2507 if (r < 0)
2508 return bus_log_parse_error(r);
2509
2510 r = sd_bus_call_method(
2511 bus,
2512 "org.freedesktop.machine1",
2513 path,
2514 "org.freedesktop.machine1.Machine",
2515 "Terminate",
2516 &error,
2517 NULL,
2518 NULL);
2519 if (r < 0) {
2520 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2521 return 0;
2522 }
2523
2524 return 0;
2525 }
2526
2527 static int reset_audit_loginuid(void) {
2528 _cleanup_free_ char *p = NULL;
2529 int r;
2530
2531 if (arg_share_system)
2532 return 0;
2533
2534 r = read_one_line_file("/proc/self/loginuid", &p);
2535 if (r == -ENOENT)
2536 return 0;
2537 if (r < 0)
2538 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2539
2540 /* Already reset? */
2541 if (streq(p, "4294967295"))
2542 return 0;
2543
2544 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
2545 if (r < 0) {
2546 log_error_errno(r,
2547 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2548 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2549 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2550 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2551 "using systemd-nspawn. Sleeping for 5s... (%m)");
2552
2553 sleep(5);
2554 }
2555
2556 return 0;
2557 }
2558
/* Fixed 128bit keys handed to generate_mac() as hash key, one per kind
 * of interface, so that each kind is keyed differently. */
#define HOST_HASH_KEY \
        SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY \
        SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY \
        SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2562
2563 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2564 uint8_t result[8];
2565 size_t l, sz;
2566 uint8_t *v, *i;
2567 int r;
2568
2569 l = strlen(arg_machine);
2570 sz = sizeof(sd_id128_t) + l;
2571 if (idx > 0)
2572 sz += sizeof(idx);
2573
2574 v = alloca(sz);
2575
2576 /* fetch some persistent data unique to the host */
2577 r = sd_id128_get_machine((sd_id128_t*) v);
2578 if (r < 0)
2579 return r;
2580
2581 /* combine with some data unique (on this host) to this
2582 * container instance */
2583 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2584 if (idx > 0) {
2585 idx = htole64(idx);
2586 memcpy(i, &idx, sizeof(idx));
2587 }
2588
2589 /* Let's hash the host machine ID plus the container name. We
2590 * use a fixed, but originally randomly created hash key here. */
2591 siphash24(result, v, sz, hash_key.bytes);
2592
2593 assert_cc(ETH_ALEN <= sizeof(result));
2594 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2595
2596 /* see eth_random_addr in the kernel */
2597 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2598 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2599
2600 return 0;
2601 }
2602
2603 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2604 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2605 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2606 struct ether_addr mac_host, mac_container;
2607 int r, i;
2608
2609 if (!arg_private_network)
2610 return 0;
2611
2612 if (!arg_network_veth)
2613 return 0;
2614
2615 /* Use two different interface name prefixes depending whether
2616 * we are in bridge mode or not. */
2617 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2618 arg_network_bridge ? "vb" : "ve", arg_machine);
2619
2620 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2621 if (r < 0)
2622 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2623
2624 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2625 if (r < 0)
2626 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2627
2628 r = sd_netlink_open(&rtnl);
2629 if (r < 0)
2630 return log_error_errno(r, "Failed to connect to netlink: %m");
2631
2632 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2633 if (r < 0)
2634 return log_error_errno(r, "Failed to allocate netlink message: %m");
2635
2636 r = sd_netlink_message_append_string(m, IFLA_IFNAME, iface_name);
2637 if (r < 0)
2638 return log_error_errno(r, "Failed to add netlink interface name: %m");
2639
2640 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2641 if (r < 0)
2642 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2643
2644 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2645 if (r < 0)
2646 return log_error_errno(r, "Failed to open netlink container: %m");
2647
2648 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2649 if (r < 0)
2650 return log_error_errno(r, "Failed to open netlink container: %m");
2651
2652 r = sd_netlink_message_open_container(m, VETH_INFO_PEER);
2653 if (r < 0)
2654 return log_error_errno(r, "Failed to open netlink container: %m");
2655
2656 r = sd_netlink_message_append_string(m, IFLA_IFNAME, "host0");
2657 if (r < 0)
2658 return log_error_errno(r, "Failed to add netlink interface name: %m");
2659
2660 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2661 if (r < 0)
2662 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2663
2664 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2665 if (r < 0)
2666 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2667
2668 r = sd_netlink_message_close_container(m);
2669 if (r < 0)
2670 return log_error_errno(r, "Failed to close netlink container: %m");
2671
2672 r = sd_netlink_message_close_container(m);
2673 if (r < 0)
2674 return log_error_errno(r, "Failed to close netlink container: %m");
2675
2676 r = sd_netlink_message_close_container(m);
2677 if (r < 0)
2678 return log_error_errno(r, "Failed to close netlink container: %m");
2679
2680 r = sd_netlink_call(rtnl, m, 0, NULL);
2681 if (r < 0)
2682 return log_error_errno(r, "Failed to add new veth interfaces (host0, %s): %m", iface_name);
2683
2684 i = (int) if_nametoindex(iface_name);
2685 if (i <= 0)
2686 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2687
2688 *ifi = i;
2689
2690 return 0;
2691 }
2692
2693 static int setup_bridge(const char veth_name[], int *ifi) {
2694 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2695 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2696 int r, bridge;
2697
2698 if (!arg_private_network)
2699 return 0;
2700
2701 if (!arg_network_veth)
2702 return 0;
2703
2704 if (!arg_network_bridge)
2705 return 0;
2706
2707 bridge = (int) if_nametoindex(arg_network_bridge);
2708 if (bridge <= 0)
2709 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2710
2711 *ifi = bridge;
2712
2713 r = sd_netlink_open(&rtnl);
2714 if (r < 0)
2715 return log_error_errno(r, "Failed to connect to netlink: %m");
2716
2717 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2718 if (r < 0)
2719 return log_error_errno(r, "Failed to allocate netlink message: %m");
2720
2721 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2722 if (r < 0)
2723 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2724
2725 r = sd_netlink_message_append_string(m, IFLA_IFNAME, veth_name);
2726 if (r < 0)
2727 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2728
2729 r = sd_netlink_message_append_u32(m, IFLA_MASTER, bridge);
2730 if (r < 0)
2731 return log_error_errno(r, "Failed to add netlink master field: %m");
2732
2733 r = sd_netlink_call(rtnl, m, 0, NULL);
2734 if (r < 0)
2735 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2736
2737 return 0;
2738 }
2739
2740 static int parse_interface(struct udev *udev, const char *name) {
2741 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2742 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2743 int ifi;
2744
2745 ifi = (int) if_nametoindex(name);
2746 if (ifi <= 0)
2747 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2748
2749 sprintf(ifi_str, "n%i", ifi);
2750 d = udev_device_new_from_device_id(udev, ifi_str);
2751 if (!d)
2752 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2753
2754 if (udev_device_get_is_initialized(d) <= 0) {
2755 log_error("Network interface %s is not initialized yet.", name);
2756 return -EBUSY;
2757 }
2758
2759 return ifi;
2760 }
2761
2762 static int move_network_interfaces(pid_t pid) {
2763 _cleanup_udev_unref_ struct udev *udev = NULL;
2764 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2765 char **i;
2766 int r;
2767
2768 if (!arg_private_network)
2769 return 0;
2770
2771 if (strv_isempty(arg_network_interfaces))
2772 return 0;
2773
2774 r = sd_netlink_open(&rtnl);
2775 if (r < 0)
2776 return log_error_errno(r, "Failed to connect to netlink: %m");
2777
2778 udev = udev_new();
2779 if (!udev) {
2780 log_error("Failed to connect to udev.");
2781 return -ENOMEM;
2782 }
2783
2784 STRV_FOREACH(i, arg_network_interfaces) {
2785 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2786 int ifi;
2787
2788 ifi = parse_interface(udev, *i);
2789 if (ifi < 0)
2790 return ifi;
2791
2792 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2793 if (r < 0)
2794 return log_error_errno(r, "Failed to allocate netlink message: %m");
2795
2796 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2797 if (r < 0)
2798 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2799
2800 r = sd_netlink_call(rtnl, m, 0, NULL);
2801 if (r < 0)
2802 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2803 }
2804
2805 return 0;
2806 }
2807
2808 static int setup_macvlan(pid_t pid) {
2809 _cleanup_udev_unref_ struct udev *udev = NULL;
2810 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2811 unsigned idx = 0;
2812 char **i;
2813 int r;
2814
2815 if (!arg_private_network)
2816 return 0;
2817
2818 if (strv_isempty(arg_network_macvlan))
2819 return 0;
2820
2821 r = sd_netlink_open(&rtnl);
2822 if (r < 0)
2823 return log_error_errno(r, "Failed to connect to netlink: %m");
2824
2825 udev = udev_new();
2826 if (!udev) {
2827 log_error("Failed to connect to udev.");
2828 return -ENOMEM;
2829 }
2830
2831 STRV_FOREACH(i, arg_network_macvlan) {
2832 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2833 _cleanup_free_ char *n = NULL;
2834 struct ether_addr mac;
2835 int ifi;
2836
2837 ifi = parse_interface(udev, *i);
2838 if (ifi < 0)
2839 return ifi;
2840
2841 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2842 if (r < 0)
2843 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2844
2845 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2846 if (r < 0)
2847 return log_error_errno(r, "Failed to allocate netlink message: %m");
2848
2849 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2850 if (r < 0)
2851 return log_error_errno(r, "Failed to add netlink interface index: %m");
2852
2853 n = strappend("mv-", *i);
2854 if (!n)
2855 return log_oom();
2856
2857 strshorten(n, IFNAMSIZ-1);
2858
2859 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2860 if (r < 0)
2861 return log_error_errno(r, "Failed to add netlink interface name: %m");
2862
2863 r = sd_netlink_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2864 if (r < 0)
2865 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2866
2867 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2868 if (r < 0)
2869 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2870
2871 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2872 if (r < 0)
2873 return log_error_errno(r, "Failed to open netlink container: %m");
2874
2875 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2876 if (r < 0)
2877 return log_error_errno(r, "Failed to open netlink container: %m");
2878
2879 r = sd_netlink_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2880 if (r < 0)
2881 return log_error_errno(r, "Failed to append macvlan mode: %m");
2882
2883 r = sd_netlink_message_close_container(m);
2884 if (r < 0)
2885 return log_error_errno(r, "Failed to close netlink container: %m");
2886
2887 r = sd_netlink_message_close_container(m);
2888 if (r < 0)
2889 return log_error_errno(r, "Failed to close netlink container: %m");
2890
2891 r = sd_netlink_call(rtnl, m, 0, NULL);
2892 if (r < 0)
2893 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2894 }
2895
2896 return 0;
2897 }
2898
2899 static int setup_ipvlan(pid_t pid) {
2900 _cleanup_udev_unref_ struct udev *udev = NULL;
2901 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
2902 char **i;
2903 int r;
2904
2905 if (!arg_private_network)
2906 return 0;
2907
2908 if (strv_isempty(arg_network_ipvlan))
2909 return 0;
2910
2911 r = sd_netlink_open(&rtnl);
2912 if (r < 0)
2913 return log_error_errno(r, "Failed to connect to netlink: %m");
2914
2915 udev = udev_new();
2916 if (!udev) {
2917 log_error("Failed to connect to udev.");
2918 return -ENOMEM;
2919 }
2920
2921 STRV_FOREACH(i, arg_network_ipvlan) {
2922 _cleanup_netlink_message_unref_ sd_netlink_message *m = NULL;
2923 _cleanup_free_ char *n = NULL;
2924 int ifi;
2925
2926 ifi = parse_interface(udev, *i);
2927 if (ifi < 0)
2928 return ifi;
2929
2930 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2931 if (r < 0)
2932 return log_error_errno(r, "Failed to allocate netlink message: %m");
2933
2934 r = sd_netlink_message_append_u32(m, IFLA_LINK, ifi);
2935 if (r < 0)
2936 return log_error_errno(r, "Failed to add netlink interface index: %m");
2937
2938 n = strappend("iv-", *i);
2939 if (!n)
2940 return log_oom();
2941
2942 strshorten(n, IFNAMSIZ-1);
2943
2944 r = sd_netlink_message_append_string(m, IFLA_IFNAME, n);
2945 if (r < 0)
2946 return log_error_errno(r, "Failed to add netlink interface name: %m");
2947
2948 r = sd_netlink_message_append_u32(m, IFLA_NET_NS_PID, pid);
2949 if (r < 0)
2950 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2951
2952 r = sd_netlink_message_open_container(m, IFLA_LINKINFO);
2953 if (r < 0)
2954 return log_error_errno(r, "Failed to open netlink container: %m");
2955
2956 r = sd_netlink_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2957 if (r < 0)
2958 return log_error_errno(r, "Failed to open netlink container: %m");
2959
2960 r = sd_netlink_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2961 if (r < 0)
2962 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2963
2964 r = sd_netlink_message_close_container(m);
2965 if (r < 0)
2966 return log_error_errno(r, "Failed to close netlink container: %m");
2967
2968 r = sd_netlink_message_close_container(m);
2969 if (r < 0)
2970 return log_error_errno(r, "Failed to close netlink container: %m");
2971
2972 r = sd_netlink_call(rtnl, m, 0, NULL);
2973 if (r < 0)
2974 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2975 }
2976
2977 return 0;
2978 }
2979
/* Installs a seccomp filter that allows everything by default, except:
 *
 *  - the system calls of the table below fail with EPERM, unless the
 *    capability listed next to them is set in arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * If seccomp_load() returns -EINVAL this is logged at debug level and
 * treated as success. Without HAVE_SECCOMP this is a NOP that returns
 * 0. Returns 0 on success, a negative error otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                /* If the capability is retained, the system call stays available */
                if ((arg_retain & (1ULL << blacklist[k].capability)) != 0)
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means the system call is unknown, which we
                 * simply ignore. Everything else is fatal. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Much of the audit userspace fails when run in a container,
         * since audit is broken there. Hence, let's turn off creation
         * of audit sockets altogether: with the rule below
         * socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
3074
3075 static int setup_propagate(const char *root) {
3076 const char *p, *q;
3077
3078 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3079 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3080 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
3081 (void) mkdir_p(p, 0600);
3082
3083 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
3084 return log_error_errno(errno, "Failed to create /run/systemd: %m");
3085
3086 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3087 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
3088
3089 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3090 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
3091
3092 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
3093 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
3094 return log_error_errno(errno, "Failed to install propagation bind mount.");
3095
3096 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
3097 return log_error_errno(errno, "Failed to make propagation mount read-only");
3098
3099 return 0;
3100 }
3101
3102 static int setup_image(char **device_path, int *loop_nr) {
3103 struct loop_info64 info = {
3104 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
3105 };
3106 _cleanup_close_ int fd = -1, control = -1, loop = -1;
3107 _cleanup_free_ char* loopdev = NULL;
3108 struct stat st;
3109 int r, nr;
3110
3111 assert(device_path);
3112 assert(loop_nr);
3113 assert(arg_image);
3114
3115 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3116 if (fd < 0)
3117 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
3118
3119 if (fstat(fd, &st) < 0)
3120 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
3121
3122 if (S_ISBLK(st.st_mode)) {
3123 char *p;
3124
3125 p = strdup(arg_image);
3126 if (!p)
3127 return log_oom();
3128
3129 *device_path = p;
3130
3131 *loop_nr = -1;
3132
3133 r = fd;
3134 fd = -1;
3135
3136 return r;
3137 }
3138
3139 if (!S_ISREG(st.st_mode)) {
3140 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
3141 return -EINVAL;
3142 }
3143
3144 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3145 if (control < 0)
3146 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
3147
3148 nr = ioctl(control, LOOP_CTL_GET_FREE);
3149 if (nr < 0)
3150 return log_error_errno(errno, "Failed to allocate loop device: %m");
3151
3152 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
3153 return log_oom();
3154
3155 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
3156 if (loop < 0)
3157 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
3158
3159 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
3160 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
3161
3162 if (arg_read_only)
3163 info.lo_flags |= LO_FLAGS_READ_ONLY;
3164
3165 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
3166 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
3167
3168 *device_path = loopdev;
3169 loopdev = NULL;
3170
3171 *loop_nr = nr;
3172
3173 r = loop;
3174 loop = -1;
3175
3176 return r;
3177 }
3178
/* Explanation appended to the error messages about unusable partition
 * tables. One string literal piece per line of output. */
#define PARTITION_TABLE_BLURB                                           \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
3185
3186 static int dissect_image(
3187 int fd,
3188 char **root_device, bool *root_device_rw,
3189 char **home_device, bool *home_device_rw,
3190 char **srv_device, bool *srv_device_rw,
3191 bool *secondary) {
3192
3193 #ifdef HAVE_BLKID
3194 int home_nr = -1, srv_nr = -1;
3195 #ifdef GPT_ROOT_NATIVE
3196 int root_nr = -1;
3197 #endif
3198 #ifdef GPT_ROOT_SECONDARY
3199 int secondary_root_nr = -1;
3200 #endif
3201 _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
3202 _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
3203 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
3204 _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3205 _cleanup_udev_unref_ struct udev *udev = NULL;
3206 struct udev_list_entry *first, *item;
3207 bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
3208 bool is_gpt, is_mbr, multiple_generic = false;
3209 const char *pttype = NULL;
3210 blkid_partlist pl;
3211 struct stat st;
3212 unsigned i;
3213 int r;
3214
3215 assert(fd >= 0);
3216 assert(root_device);
3217 assert(home_device);
3218 assert(srv_device);
3219 assert(secondary);
3220 assert(arg_image);
3221
3222 b = blkid_new_probe();
3223 if (!b)
3224 return log_oom();
3225
3226 errno = 0;
3227 r = blkid_probe_set_device(b, fd, 0, 0);
3228 if (r != 0) {
3229 if (errno == 0)
3230 return log_oom();
3231
3232 log_error_errno(errno, "Failed to set device on blkid probe: %m");
3233 return -errno;
3234 }
3235
3236 blkid_probe_enable_partitions(b, 1);
3237 blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
3238
3239 errno = 0;
3240 r = blkid_do_safeprobe(b);
3241 if (r == -2 || r == 1) {
3242 log_error("Failed to identify any partition table on\n"
3243 " %s\n"
3244 PARTITION_TABLE_BLURB, arg_image);
3245 return -EINVAL;
3246 } else if (r != 0) {
3247 if (errno == 0)
3248 errno = EIO;
3249 log_error_errno(errno, "Failed to probe: %m");
3250 return -errno;
3251 }
3252
3253 (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
3254
3255 is_gpt = streq_ptr(pttype, "gpt");
3256 is_mbr = streq_ptr(pttype, "dos");
3257
3258 if (!is_gpt && !is_mbr) {
3259 log_error("No GPT or MBR partition table discovered on\n"
3260 " %s\n"
3261 PARTITION_TABLE_BLURB, arg_image);
3262 return -EINVAL;
3263 }
3264
3265 errno = 0;
3266 pl = blkid_probe_get_partitions(b);
3267 if (!pl) {
3268 if (errno == 0)
3269 return log_oom();
3270
3271 log_error("Failed to list partitions of %s", arg_image);
3272 return -errno;
3273 }
3274
3275 udev = udev_new();
3276 if (!udev)
3277 return log_oom();
3278
3279 if (fstat(fd, &st) < 0)
3280 return log_error_errno(errno, "Failed to stat block device: %m");
3281
3282 d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
3283 if (!d)
3284 return log_oom();
3285
3286 for (i = 0;; i++) {
3287 int n, m;
3288
3289 if (i >= 10) {
3290 log_error("Kernel partitions never appeared.");
3291 return -ENXIO;
3292 }
3293
3294 e = udev_enumerate_new(udev);
3295 if (!e)
3296 return log_oom();
3297
3298 r = udev_enumerate_add_match_parent(e, d);
3299 if (r < 0)
3300 return log_oom();
3301
3302 r = udev_enumerate_scan_devices(e);
3303 if (r < 0)
3304 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
3305
3306 /* Count the partitions enumerated by the kernel */
3307 n = 0;
3308 first = udev_enumerate_get_list_entry(e);
3309 udev_list_entry_foreach(item, first)
3310 n++;
3311
3312 /* Count the partitions enumerated by blkid */
3313 m = blkid_partlist_numof_partitions(pl);
3314 if (n == m + 1)
3315 break;
3316 if (n > m + 1) {
3317 log_error("blkid and kernel partition list do not match.");
3318 return -EIO;
3319 }
3320 if (n < m + 1) {
3321 unsigned j;
3322
3323 /* The kernel has probed fewer partitions than
3324 * blkid? Maybe the kernel prober is still
3325 * running or it got EBUSY because udev
3326 * already opened the device. Let's reprobe
3327 * the device, which is a synchronous call
3328 * that waits until probing is complete. */
3329
3330 for (j = 0; j < 20; j++) {
3331
3332 r = ioctl(fd, BLKRRPART, 0);
3333 if (r < 0)
3334 r = -errno;
3335 if (r >= 0 || r != -EBUSY)
3336 break;
3337
3338 /* If something else has the device
3339 * open, such as an udev rule, the
3340 * ioctl will return EBUSY. Since
3341 * there's no way to wait until it
3342 * isn't busy anymore, let's just wait
3343 * a bit, and try again.
3344 *
3345 * This is really something they
3346 * should fix in the kernel! */
3347
3348 usleep(50 * USEC_PER_MSEC);
3349 }
3350
3351 if (r < 0)
3352 return log_error_errno(r, "Failed to reread partition table: %m");
3353 }
3354
3355 e = udev_enumerate_unref(e);
3356 }
3357
3358 first = udev_enumerate_get_list_entry(e);
3359 udev_list_entry_foreach(item, first) {
3360 _cleanup_udev_device_unref_ struct udev_device *q;
3361 const char *node;
3362 unsigned long long flags;
3363 blkid_partition pp;
3364 dev_t qn;
3365 int nr;
3366
3367 errno = 0;
3368 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
3369 if (!q) {
3370 if (!errno)
3371 errno = ENOMEM;
3372
3373 log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
3374 return -errno;
3375 }
3376
3377 qn = udev_device_get_devnum(q);
3378 if (major(qn) == 0)
3379 continue;
3380
3381 if (st.st_rdev == qn)
3382 continue;
3383
3384 node = udev_device_get_devnode(q);
3385 if (!node)
3386 continue;
3387
3388 pp = blkid_partlist_devno_to_partition(pl, qn);
3389 if (!pp)
3390 continue;
3391
3392 flags = blkid_partition_get_flags(pp);
3393
3394 nr = blkid_partition_get_partno(pp);
3395 if (nr < 0)
3396 continue;
3397
3398 if (is_gpt) {
3399 sd_id128_t type_id;
3400 const char *stype;
3401
3402 if (flags & GPT_FLAG_NO_AUTO)
3403 continue;
3404
3405 stype = blkid_partition_get_type_string(pp);
3406 if (!stype)
3407 continue;
3408
3409 if (sd_id128_from_string(stype, &type_id) < 0)
3410 continue;
3411
3412 if (sd_id128_equal(type_id, GPT_HOME)) {
3413
3414 if (home && nr >= home_nr)
3415 continue;
3416
3417 home_nr = nr;
3418 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3419
3420 r = free_and_strdup(&home, node);
3421 if (r < 0)
3422 return log_oom();
3423
3424 } else if (sd_id128_equal(type_id, GPT_SRV)) {
3425
3426 if (srv && nr >= srv_nr)
3427 continue;
3428
3429 srv_nr = nr;
3430 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3431
3432 r = free_and_strdup(&srv, node);
3433 if (r < 0)
3434 return log_oom();
3435 }
3436 #ifdef GPT_ROOT_NATIVE
3437 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3438
3439 if (root && nr >= root_nr)
3440 continue;
3441
3442 root_nr = nr;
3443 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3444
3445 r = free_and_strdup(&root, node);
3446 if (r < 0)
3447 return log_oom();
3448 }
3449 #endif
3450 #ifdef GPT_ROOT_SECONDARY
3451 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3452
3453 if (secondary_root && nr >= secondary_root_nr)
3454 continue;
3455
3456 secondary_root_nr = nr;
3457 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3458
3459 r = free_and_strdup(&secondary_root, node);
3460 if (r < 0)
3461 return log_oom();
3462 }
3463 #endif
3464 else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3465
3466 if (generic)
3467 multiple_generic = true;
3468 else {
3469 generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3470
3471 r = free_and_strdup(&generic, node);
3472 if (r < 0)
3473 return log_oom();
3474 }
3475 }
3476
3477 } else if (is_mbr) {
3478 int type;
3479
3480 if (flags != 0x80) /* Bootable flag */
3481 continue;
3482
3483 type = blkid_partition_get_type(pp);
3484 if (type != 0x83) /* Linux partition */
3485 continue;
3486
3487 if (generic)
3488 multiple_generic = true;
3489 else {
3490 generic_rw = true;
3491
3492 r = free_and_strdup(&root, node);
3493 if (r < 0)
3494 return log_oom();
3495 }
3496 }
3497 }
3498
3499 if (root) {
3500 *root_device = root;
3501 root = NULL;
3502
3503 *root_device_rw = root_rw;
3504 *secondary = false;
3505 } else if (secondary_root) {
3506 *root_device = secondary_root;
3507 secondary_root = NULL;
3508
3509 *root_device_rw = secondary_root_rw;
3510 *secondary = true;
3511 } else if (generic) {
3512
3513 /* There were no partitions with precise meanings
3514 * around, but we found generic partitions. In this
3515 * case, if there's only one, we can go ahead and boot
3516 * it, otherwise we bail out, because we really cannot
3517 * make any sense of it. */
3518
3519 if (multiple_generic) {
3520 log_error("Identified multiple bootable Linux partitions on\n"
3521 " %s\n"
3522 PARTITION_TABLE_BLURB, arg_image);
3523 return -EINVAL;
3524 }
3525
3526 *root_device = generic;
3527 generic = NULL;
3528
3529 *root_device_rw = generic_rw;
3530 *secondary = false;
3531 } else {
3532 log_error("Failed to identify root partition in disk image\n"
3533 " %s\n"
3534 PARTITION_TABLE_BLURB, arg_image);
3535 return -EINVAL;
3536 }
3537
3538 if (home) {
3539 *home_device = home;
3540 home = NULL;
3541
3542 *home_device_rw = home_rw;
3543 }
3544
3545 if (srv) {
3546 *srv_device = srv;
3547 srv = NULL;
3548
3549 *srv_device_rw = srv_rw;
3550 }
3551
3552 return 0;
3553 #else
3554 log_error("--image= is not supported, compiled without blkid support.");
3555 return -EOPNOTSUPP;
3556 #endif
3557 }
3558
/* Probes the file system type of 'what' with libblkid and mounts it on
 * 'where', or on 'where' with 'directory' appended when 'directory' is
 * non-NULL. The mount is read-only when 'rw' is false or when the global
 * arg_read_only is set. LUKS containers are rejected.
 *
 * Returns 0 on success and a negative errno-style code on failure. When
 * built without blkid support this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *target;
        unsigned long mount_flags;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-device setting */
        if (arg_read_only)
                rw = false;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        /* blkid does not reliably set errno, hence reset it so that
         * we can tell OOM apart from other failures */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        /* We are only interested in the superblock type */
        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        switch (r) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        mount_flags = MS_NODEV;
        if (!rw)
                mount_flags |= MS_RDONLY;

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3622
3623 static int mount_devices(
3624 const char *where,
3625 const char *root_device, bool root_device_rw,
3626 const char *home_device, bool home_device_rw,
3627 const char *srv_device, bool srv_device_rw) {
3628 int r;
3629
3630 assert(where);
3631
3632 if (root_device) {
3633 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3634 if (r < 0)
3635 return log_error_errno(r, "Failed to mount root directory: %m");
3636 }
3637
3638 if (home_device) {
3639 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3640 if (r < 0)
3641 return log_error_errno(r, "Failed to mount home directory: %m");
3642 }
3643
3644 if (srv_device) {
3645 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3646 if (r < 0)
3647 return log_error_errno(r, "Failed to mount server data directory: %m");
3648 }
3649
3650 return 0;
3651 }
3652
3653 static void loop_remove(int nr, int *image_fd) {
3654 _cleanup_close_ int control = -1;
3655 int r;
3656
3657 if (nr < 0)
3658 return;
3659
3660 if (image_fd && *image_fd >= 0) {
3661 r = ioctl(*image_fd, LOOP_CLR_FD);
3662 if (r < 0)
3663 log_debug_errno(errno, "Failed to close loop image: %m");
3664 *image_fd = safe_close(*image_fd);
3665 }
3666
3667 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3668 if (control < 0) {
3669 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3670 return;
3671 }
3672
3673 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3674 if (r < 0)
3675 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3676 }
3677
3678 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3679 int pipe_fds[2];
3680 pid_t pid;
3681
3682 assert(database);
3683 assert(key);
3684 assert(rpid);
3685
3686 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3687 return log_error_errno(errno, "Failed to allocate pipe: %m");
3688
3689 pid = fork();
3690 if (pid < 0)
3691 return log_error_errno(errno, "Failed to fork getent child: %m");
3692 else if (pid == 0) {
3693 int nullfd;
3694 char *empty_env = NULL;
3695
3696 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3697 _exit(EXIT_FAILURE);
3698
3699 if (pipe_fds[0] > 2)
3700 safe_close(pipe_fds[0]);
3701 if (pipe_fds[1] > 2)
3702 safe_close(pipe_fds[1]);
3703
3704 nullfd = open("/dev/null", O_RDWR);
3705 if (nullfd < 0)
3706 _exit(EXIT_FAILURE);
3707
3708 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3709 _exit(EXIT_FAILURE);
3710
3711 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3712 _exit(EXIT_FAILURE);
3713
3714 if (nullfd > 2)
3715 safe_close(nullfd);
3716
3717 (void) reset_all_signal_handlers();
3718 (void) reset_signal_mask();
3719 close_all_fds(NULL, 0);
3720
3721 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3722 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3723 _exit(EXIT_FAILURE);
3724 }
3725
3726 pipe_fds[1] = safe_close(pipe_fds[1]);
3727
3728 *rpid = pid;
3729
3730 return pipe_fds[0];
3731 }
3732
3733 static int change_uid_gid(char **_home) {
3734 char line[LINE_MAX], *x, *u, *g, *h;
3735 const char *word, *state;
3736 _cleanup_free_ uid_t *uids = NULL;
3737 _cleanup_free_ char *home = NULL;
3738 _cleanup_fclose_ FILE *f = NULL;
3739 _cleanup_close_ int fd = -1;
3740 unsigned n_uids = 0;
3741 size_t sz = 0, l;
3742 uid_t uid;
3743 gid_t gid;
3744 pid_t pid;
3745 int r;
3746
3747 assert(_home);
3748
3749 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3750 /* Reset everything fully to 0, just in case */
3751
3752 r = reset_uid_gid();
3753 if (r < 0)
3754 return log_error_errno(r, "Failed to become root: %m");
3755
3756 *_home = NULL;
3757 return 0;
3758 }
3759
3760 /* First, get user credentials */
3761 fd = spawn_getent("passwd", arg_user, &pid);
3762 if (fd < 0)
3763 return fd;
3764
3765 f = fdopen(fd, "r");
3766 if (!f)
3767 return log_oom();
3768 fd = -1;
3769
3770 if (!fgets(line, sizeof(line), f)) {
3771
3772 if (!ferror(f)) {
3773 log_error("Failed to resolve user %s.", arg_user);
3774 return -ESRCH;
3775 }
3776
3777 log_error_errno(errno, "Failed to read from getent: %m");
3778 return -errno;
3779 }
3780
3781 truncate_nl(line);
3782
3783 wait_for_terminate_and_warn("getent passwd", pid, true);
3784
3785 x = strchr(line, ':');
3786 if (!x) {
3787 log_error("/etc/passwd entry has invalid user field.");
3788 return -EIO;
3789 }
3790
3791 u = strchr(x+1, ':');
3792 if (!u) {
3793 log_error("/etc/passwd entry has invalid password field.");
3794 return -EIO;
3795 }
3796
3797 u++;
3798 g = strchr(u, ':');
3799 if (!g) {
3800 log_error("/etc/passwd entry has invalid UID field.");
3801 return -EIO;
3802 }
3803
3804 *g = 0;
3805 g++;
3806 x = strchr(g, ':');
3807 if (!x) {
3808 log_error("/etc/passwd entry has invalid GID field.");
3809 return -EIO;
3810 }
3811
3812 *x = 0;
3813 h = strchr(x+1, ':');
3814 if (!h) {
3815 log_error("/etc/passwd entry has invalid GECOS field.");
3816 return -EIO;
3817 }
3818
3819 h++;
3820 x = strchr(h, ':');
3821 if (!x) {
3822 log_error("/etc/passwd entry has invalid home directory field.");
3823 return -EIO;
3824 }
3825
3826 *x = 0;
3827
3828 r = parse_uid(u, &uid);
3829 if (r < 0) {
3830 log_error("Failed to parse UID of user.");
3831 return -EIO;
3832 }
3833
3834 r = parse_gid(g, &gid);
3835 if (r < 0) {
3836 log_error("Failed to parse GID of user.");
3837 return -EIO;
3838 }
3839
3840 home = strdup(h);
3841 if (!home)
3842 return log_oom();
3843
3844 /* Second, get group memberships */
3845 fd = spawn_getent("initgroups", arg_user, &pid);
3846 if (fd < 0)
3847 return fd;
3848
3849 fclose(f);
3850 f = fdopen(fd, "r");
3851 if (!f)
3852 return log_oom();
3853 fd = -1;
3854
3855 if (!fgets(line, sizeof(line), f)) {
3856 if (!ferror(f)) {
3857 log_error("Failed to resolve user %s.", arg_user);
3858 return -ESRCH;
3859 }
3860
3861 log_error_errno(errno, "Failed to read from getent: %m");
3862 return -errno;
3863 }
3864
3865 truncate_nl(line);
3866
3867 wait_for_terminate_and_warn("getent initgroups", pid, true);
3868
3869 /* Skip over the username and subsequent separator whitespace */
3870 x = line;
3871 x += strcspn(x, WHITESPACE);
3872 x += strspn(x, WHITESPACE);
3873
3874 FOREACH_WORD(word, l, x, state) {
3875 char c[l+1];
3876
3877 memcpy(c, word, l);
3878 c[l] = 0;
3879
3880 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3881 return log_oom();
3882
3883 r = parse_uid(c, &uids[n_uids++]);
3884 if (r < 0) {
3885 log_error("Failed to parse group data from getent.");
3886 return -EIO;
3887 }
3888 }
3889
3890 r = mkdir_parents(home, 0775);
3891 if (r < 0)
3892 return log_error_errno(r, "Failed to make home root directory: %m");
3893
3894 r = mkdir_safe(home, 0755, uid, gid);
3895 if (r < 0 && r != -EEXIST)
3896 return log_error_errno(r, "Failed to make home directory: %m");
3897
3898 (void) fchown(STDIN_FILENO, uid, gid);
3899 (void) fchown(STDOUT_FILENO, uid, gid);
3900 (void) fchown(STDERR_FILENO, uid, gid);
3901
3902 if (setgroups(n_uids, uids) < 0)
3903 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3904
3905 if (setresgid(gid, gid, gid) < 0)
3906 return log_error_errno(errno, "setregid() failed: %m");
3907
3908 if (setresuid(uid, uid, uid) < 0)
3909 return log_error_errno(errno, "setreuid() failed: %m");
3910
3911 if (_home) {
3912 *_home = home;
3913 home = NULL;
3914 }
3915
3916 return 0;
3917 }
3918
3919 /*
3920 * Return values:
3921 * < 0 : wait_for_terminate() failed to get the state of the
3922 * container, the container was terminated by a signal, or
3923 * failed for an unknown reason. No change is made to the
3924 * container argument.
3925 * > 0 : The program executed in the container terminated with an
3926 * error. The exit code of the program executed in the
3927 * container is returned. The container argument has been set
3928 * to CONTAINER_TERMINATED.
3929 * 0 : The container is being rebooted, has been shut down or exited
3930 * successfully. The container argument has been set to either
3931 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3932 *
3933 * That is, success is indicated by a return value of zero, and an
3934 * error is indicated by a non-zero value.
3935 */
3936 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3937 siginfo_t status;
3938 int r;
3939
3940 r = wait_for_terminate(pid, &status);
3941 if (r < 0)
3942 return log_warning_errno(r, "Failed to wait for container: %m");
3943
3944 switch (status.si_code) {
3945
3946 case CLD_EXITED:
3947 if (status.si_status == 0) {
3948 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3949
3950 } else
3951 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3952
3953 *container = CONTAINER_TERMINATED;
3954 return status.si_status;
3955
3956 case CLD_KILLED:
3957 if (status.si_status == SIGINT) {
3958
3959 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3960 *container = CONTAINER_TERMINATED;
3961 return 0;
3962
3963 } else if (status.si_status == SIGHUP) {
3964
3965 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3966 *container = CONTAINER_REBOOTED;
3967 return 0;
3968 }
3969
3970 /* CLD_KILLED fallthrough */
3971
3972 case CLD_DUMPED:
3973 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3974 return -EIO;
3975
3976 default:
3977 log_error("Container %s failed due to unknown reason.", arg_machine);
3978 return -EIO;
3979 }
3980
3981 return r;
3982 }
3983
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
}
3985
3986 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3987 pid_t pid;
3988
3989 pid = PTR_TO_UINT32(userdata);
3990 if (pid > 0) {
3991 if (kill(pid, arg_kill_signal) >= 0) {
3992 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3993 sd_event_source_set_userdata(s, NULL);
3994 return 0;
3995 }
3996 }
3997
3998 sd_event_exit(sd_event_source_get_event(s), 0);
3999 return 0;
4000 }
4001
4002 static int determine_names(void) {
4003 int r;
4004
4005 if (!arg_image && !arg_directory) {
4006 if (arg_machine) {
4007 _cleanup_(image_unrefp) Image *i = NULL;
4008
4009 r = image_find(arg_machine, &i);
4010 if (r < 0)
4011 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
4012 else if (r == 0) {
4013 log_error("No image for machine '%s': %m", arg_machine);
4014 return -ENOENT;
4015 }
4016
4017 if (i->type == IMAGE_RAW)
4018 r = set_sanitized_path(&arg_image, i->path);
4019 else
4020 r = set_sanitized_path(&arg_directory, i->path);
4021 if (r < 0)
4022 return log_error_errno(r, "Invalid image directory: %m");
4023
4024 if (!arg_ephemeral)
4025 arg_read_only = arg_read_only || i->read_only;
4026 } else
4027 arg_directory = get_current_dir_name();
4028
4029 if (!arg_directory && !arg_machine) {
4030 log_error("Failed to determine path, please use -D or -i.");
4031 return -EINVAL;
4032 }
4033 }
4034
4035 if (!arg_machine) {
4036 if (arg_directory && path_equal(arg_directory, "/"))
4037 arg_machine = gethostname_malloc();
4038 else
4039 arg_machine = strdup(basename(arg_image ?: arg_directory));
4040
4041 if (!arg_machine)
4042 return log_oom();
4043
4044 hostname_cleanup(arg_machine);
4045 if (!machine_name_is_valid(arg_machine)) {
4046 log_error("Failed to determine machine name automatically, please use -M.");
4047 return -EINVAL;
4048 }
4049
4050 if (arg_ephemeral) {
4051 char *b;
4052
4053 /* Add a random suffix when this is an
4054 * ephemeral machine, so that we can run many
4055 * instances at once without manually having
4056 * to specify -M each time. */
4057
4058 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
4059 return log_oom();
4060
4061 free(arg_machine);
4062 arg_machine = b;
4063 }
4064 }
4065
4066 return 0;
4067 }
4068
4069 static int determine_uid_shift(const char *directory) {
4070 int r;
4071
4072 if (!arg_userns) {
4073 arg_uid_shift = 0;
4074 return 0;
4075 }
4076
4077 if (arg_uid_shift == UID_INVALID) {
4078 struct stat st;
4079
4080 r = stat(directory, &st);
4081 if (r < 0)
4082 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
4083
4084 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
4085
4086 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
4087 log_error("UID and GID base of %s don't match.", directory);
4088 return -EINVAL;
4089 }
4090
4091 arg_uid_range = UINT32_C(0x10000);
4092 }
4093
4094 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
4095 log_error("UID base too high for UID range.");
4096 return -EINVAL;
4097 }
4098
4099 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
4100 return 0;
4101 }
4102
4103 static int inner_child(
4104 Barrier *barrier,
4105 const char *directory,
4106 bool secondary,
4107 int kmsg_socket,
4108 int rtnl_socket,
4109 FDSet *fds,
4110 int argc,
4111 char *argv[]) {
4112
4113 _cleanup_free_ char *home = NULL;
4114 unsigned n_env = 2;
4115 const char *envp[] = {
4116 "PATH=" DEFAULT_PATH_SPLIT_USR,
4117 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4118 NULL, /* TERM */
4119 NULL, /* HOME */
4120 NULL, /* USER */
4121 NULL, /* LOGNAME */
4122 NULL, /* container_uuid */
4123 NULL, /* LISTEN_FDS */
4124 NULL, /* LISTEN_PID */
4125 NULL
4126 };
4127
4128 _cleanup_strv_free_ char **env_use = NULL;
4129 int r;
4130
4131 assert(barrier);
4132 assert(directory);
4133 assert(kmsg_socket >= 0);
4134
4135 if (arg_userns) {
4136 /* Tell the parent, that it now can write the UID map. */
4137 (void) barrier_place(barrier); /* #1 */
4138
4139 /* Wait until the parent wrote the UID map */
4140 if (!barrier_place_and_sync(barrier)) { /* #2 */
4141 log_error("Parent died too early");
4142 return -ESRCH;
4143 }
4144 }
4145
4146 r = mount_all(NULL, true);
4147 if (r < 0)
4148 return r;
4149
4150 /* Wait until we are cgroup-ified, so that we
4151 * can mount the right cgroup path writable */
4152 if (!barrier_place_and_sync(barrier)) { /* #3 */
4153 log_error("Parent died too early");
4154 return -ESRCH;
4155 }
4156
4157 r = mount_systemd_cgroup_writable("");
4158 if (r < 0)
4159 return r;
4160
4161 r = reset_uid_gid();
4162 if (r < 0)
4163 return log_error_errno(r, "Couldn't become new root: %m");
4164
4165 r = setup_boot_id(NULL);
4166 if (r < 0)
4167 return r;
4168
4169 r = setup_kmsg(NULL, kmsg_socket);
4170 if (r < 0)
4171 return r;
4172 kmsg_socket = safe_close(kmsg_socket);
4173
4174 umask(0022);
4175
4176 if (setsid() < 0)
4177 return log_error_errno(errno, "setsid() failed: %m");
4178
4179 if (arg_private_network)
4180 loopback_setup();
4181
4182 r = send_rtnl(rtnl_socket);
4183 if (r < 0)
4184 return r;
4185 rtnl_socket = safe_close(rtnl_socket);
4186
4187 if (drop_capabilities() < 0)
4188 return log_error_errno(errno, "drop_capabilities() failed: %m");
4189
4190 setup_hostname();
4191
4192 if (arg_personality != PERSONALITY_INVALID) {
4193 if (personality(arg_personality) < 0)
4194 return log_error_errno(errno, "personality() failed: %m");
4195 } else if (secondary) {
4196 if (personality(PER_LINUX32) < 0)
4197 return log_error_errno(errno, "personality() failed: %m");
4198 }
4199
4200 #ifdef HAVE_SELINUX
4201 if (arg_selinux_context)
4202 if (setexeccon((security_context_t) arg_selinux_context) < 0)
4203 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4204 #endif
4205
4206 r = change_uid_gid(&home);
4207 if (r < 0)
4208 return r;
4209
4210 envp[n_env] = strv_find_prefix(environ, "TERM=");
4211 if (envp[n_env])
4212 n_env ++;
4213
4214 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4215 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4216 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
4217 return log_oom();
4218
4219 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4220 char as_uuid[37];
4221
4222 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
4223 return log_oom();
4224 }
4225
4226 if (fdset_size(fds) > 0) {
4227 r = fdset_cloexec(fds, false);
4228 if (r < 0)
4229 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4230
4231 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
4232 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
4233 return log_oom();
4234 }
4235
4236 env_use = strv_env_merge(2, envp, arg_setenv);
4237 if (!env_use)
4238 return log_oom();
4239
4240 /* Let the parent know that we are ready and
4241 * wait until the parent is ready with the
4242 * setup, too... */
4243 if (!barrier_place_and_sync(barrier)) { /* #4 */
4244 log_error("Parent died too early");
4245 return -ESRCH;
4246 }
4247
4248 /* Now, explicitly close the log, so that we
4249 * then can close all remaining fds. Closing
4250 * the log explicitly first has the benefit
4251 * that the logging subsystem knows about it,
4252 * and is thus ready to be reopened should we
4253 * need it again. Note that the other fds
4254 * closed here are at least the locking and
4255 * barrier fds. */
4256 log_close();
4257 (void) fdset_close_others(fds);
4258
4259 if (arg_boot) {
4260 char **a;
4261 size_t m;
4262
4263 /* Automatically search for the init system */
4264
4265 m = 1 + argc - optind;
4266 a = newa(char*, m + 1);
4267 memcpy(a + 1, argv + optind, m * sizeof(char*));
4268
4269 a[0] = (char*) "/usr/lib/systemd/systemd";
4270 execve(a[0], a, env_use);
4271
4272 a[0] = (char*) "/lib/systemd/systemd";
4273 execve(a[0], a, env_use);
4274
4275 a[0] = (char*) "/sbin/init";
4276 execve(a[0], a, env_use);
4277 } else if (argc > optind)
4278 execvpe(argv[optind], argv + optind, env_use);
4279 else {
4280 chdir(home ? home : "/root");
4281 execle("/bin/bash", "-bash", NULL, env_use);
4282 execle("/bin/sh", "-sh", NULL, env_use);
4283 }
4284
4285 (void) log_open();
4286 return log_error_errno(errno, "execv() failed: %m");
4287 }
4288
4289 static int outer_child(
4290 Barrier *barrier,
4291 const char *directory,
4292 const char *console,
4293 const char *root_device, bool root_device_rw,
4294 const char *home_device, bool home_device_rw,
4295 const char *srv_device, bool srv_device_rw,
4296 bool interactive,
4297 bool secondary,
4298 int pid_socket,
4299 int kmsg_socket,
4300 int rtnl_socket,
4301 int uid_shift_socket,
4302 FDSet *fds,
4303 int argc,
4304 char *argv[]) {
4305
4306 pid_t pid;
4307 ssize_t l;
4308 int r;
4309
4310 assert(barrier);
4311 assert(directory);
4312 assert(console);
4313 assert(pid_socket >= 0);
4314 assert(kmsg_socket >= 0);
4315
4316 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
4317 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4318
4319 if (interactive) {
4320 close_nointr(STDIN_FILENO);
4321 close_nointr(STDOUT_FILENO);
4322 close_nointr(STDERR_FILENO);
4323
4324 r = open_terminal(console, O_RDWR);
4325 if (r != STDIN_FILENO) {
4326 if (r >= 0) {
4327 safe_close(r);
4328 r = -EINVAL;
4329 }
4330
4331 return log_error_errno(r, "Failed to open console: %m");
4332 }
4333
4334 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4335 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
4336 return log_error_errno(errno, "Failed to duplicate console: %m");
4337 }
4338
4339 r = reset_audit_loginuid();
4340 if (r < 0)
4341 return r;
4342
4343 /* Mark everything as slave, so that we still
4344 * receive mounts from the real root, but don't
4345 * propagate mounts to the real root. */
4346 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
4347 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4348
4349 r = mount_devices(directory,
4350 root_device, root_device_rw,
4351 home_device, home_device_rw,
4352 srv_device, srv_device_rw);
4353 if (r < 0)
4354 return r;
4355
4356 r = determine_uid_shift(directory);
4357 if (r < 0)
4358 return r;
4359
4360 if (arg_userns) {
4361 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
4362 if (l < 0)
4363 return log_error_errno(errno, "Failed to send UID shift: %m");
4364 if (l != sizeof(arg_uid_shift)) {
4365 log_error("Short write while sending UID shift.");
4366 return -EIO;
4367 }
4368 }
4369
4370 /* Turn directory into bind mount */
4371 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
4372 return log_error_errno(errno, "Failed to make bind mount: %m");
4373
4374 r = setup_volatile(directory);
4375 if (r < 0)
4376 return r;
4377
4378 r = setup_volatile_state(directory);
4379 if (r < 0)
4380 return r;
4381
4382 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
4383 if (r < 0)
4384 return r;
4385
4386 if (arg_read_only) {
4387 r = bind_remount_recursive(directory, true);
4388 if (r < 0)
4389 return log_error_errno(r, "Failed to make tree read-only: %m");
4390 }
4391
4392 r = mount_all(directory, false);
4393 if (r < 0)
4394 return r;
4395
4396 if (copy_devnodes(directory) < 0)
4397 return r;
4398
4399 dev_setup(directory, arg_uid_shift, arg_uid_shift);
4400
4401 if (setup_pts(directory) < 0)
4402 return r;
4403
4404 r = setup_propagate(directory);
4405 if (r < 0)
4406 return r;
4407
4408 r = setup_dev_console(directory, console);
4409 if (r < 0)
4410 return r;
4411
4412 r = setup_seccomp();
4413 if (r < 0)
4414 return r;
4415
4416 r = setup_timezone(directory);
4417 if (r < 0)
4418 return r;
4419
4420 r = setup_resolv_conf(directory);
4421 if (r < 0)
4422 return r;
4423
4424 r = setup_journal(directory);
4425 if (r < 0)
4426 return r;
4427
4428 r = mount_custom(directory);
4429 if (r < 0)
4430 return r;
4431
4432 r = mount_cgroup(directory);
4433 if (r < 0)
4434 return r;
4435
4436 r = mount_move_root(directory);
4437 if (r < 0)
4438 return log_error_errno(r, "Failed to move root directory: %m");
4439
4440 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
4441 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
4442 (arg_private_network ? CLONE_NEWNET : 0) |
4443 (arg_userns ? CLONE_NEWUSER : 0),
4444 NULL);
4445 if (pid < 0)
4446 return log_error_errno(errno, "Failed to fork inner child: %m");
4447
4448 if (pid == 0) {
4449 pid_socket = safe_close(pid_socket);
4450 uid_shift_socket = safe_close(uid_shift_socket);
4451
4452 /* The inner child has all namespaces that are
4453 * requested, so that we all are owned by the user if
4454 * user namespaces are turned on. */
4455
4456 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds, argc, argv);
4457 if (r < 0)
4458 _exit(EXIT_FAILURE);
4459
4460 _exit(EXIT_SUCCESS);
4461 }
4462
4463 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4464 if (l < 0)
4465 return log_error_errno(errno, "Failed to send PID: %m");
4466 if (l != sizeof(pid)) {
4467 log_error("Short write while sending PID.");
4468 return -EIO;
4469 }
4470
4471 pid_socket = safe_close(pid_socket);
4472
4473 return 0;
4474 }
4475
4476 static int setup_uid_map(pid_t pid) {
4477 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4478 int r;
4479
4480 assert(pid > 1);
4481
4482 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4483 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4484 r = write_string_file(uid_map, line, 0);
4485 if (r < 0)
4486 return log_error_errno(r, "Failed to write UID map: %m");
4487
4488 /* We always assign the same UID and GID ranges */
4489 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4490 r = write_string_file(uid_map, line, 0);
4491 if (r < 0)
4492 return log_error_errno(r, "Failed to write GID map: %m");
4493
4494 return 0;
4495 }
4496
4497 static int chown_cgroup(pid_t pid) {
4498 _cleanup_free_ char *path = NULL, *fs = NULL;
4499 _cleanup_close_ int fd = -1;
4500 const char *fn;
4501 int r;
4502
4503 r = cg_pid_get_path(NULL, pid, &path);
4504 if (r < 0)
4505 return log_error_errno(r, "Failed to get container cgroup path: %m");
4506
4507 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
4508 if (r < 0)
4509 return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
4510
4511 fd = open(fs, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
4512 if (fd < 0)
4513 return log_error_errno(errno, "Failed to open %s: %m", fs);
4514
4515 FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4516 if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
4517 log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
4518
4519 return 0;
4520 }
4521
4522 int main(int argc, char *argv[]) {
4523
4524 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
4525 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
4526 _cleanup_close_ int master = -1, image_fd = -1;
4527 _cleanup_fdset_free_ FDSet *fds = NULL;
4528 int r, n_fd_passed, loop_nr = -1;
4529 char veth_name[IFNAMSIZ];
4530 bool secondary = false, remove_subvol = false;
4531 sigset_t mask_chld;
4532 pid_t pid = 0;
4533 int ret = EXIT_SUCCESS;
4534 union in_addr_union exposed = {};
4535 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
4536 bool interactive;
4537
4538 log_parse_environment();
4539 log_open();
4540
4541 r = parse_argv(argc, argv);
4542 if (r <= 0)
4543 goto finish;
4544
4545 r = determine_names();
4546 if (r < 0)
4547 goto finish;
4548
4549 if (geteuid() != 0) {
4550 log_error("Need to be root.");
4551 r = -EPERM;
4552 goto finish;
4553 }
4554
4555 n_fd_passed = sd_listen_fds(false);
4556 if (n_fd_passed > 0) {
4557 r = fdset_new_listen_fds(&fds, false);
4558 if (r < 0) {
4559 log_error_errno(r, "Failed to collect file descriptors: %m");
4560 goto finish;
4561 }
4562 }
4563
4564 if (arg_directory) {
4565 assert(!arg_image);
4566
4567 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
4568 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4569 r = -EINVAL;
4570 goto finish;
4571 }
4572
4573 if (arg_ephemeral) {
4574 _cleanup_free_ char *np = NULL;
4575
4576 /* If the specified path is a mount point we
4577 * generate the new snapshot immediately
4578 * inside it under a random name. However if
4579 * the specified is not a mount point we
4580 * create the new snapshot in the parent
4581 * directory, just next to it. */
4582 r = path_is_mount_point(arg_directory, 0);
4583 if (r < 0) {
4584 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
4585 goto finish;
4586 }
4587 if (r > 0)
4588 r = tempfn_random_child(arg_directory, "machine.", &np);
4589 else
4590 r = tempfn_random(arg_directory, "machine.", &np);
4591 if (r < 0) {
4592 log_error_errno(r, "Failed to generate name for snapshot: %m");
4593 goto finish;
4594 }
4595
4596 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4597 if (r < 0) {
4598 log_error_errno(r, "Failed to lock %s: %m", np);
4599 goto finish;
4600 }
4601
4602 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4603 if (r < 0) {
4604 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
4605 goto finish;
4606 }
4607
4608 free(arg_directory);
4609 arg_directory = np;
4610 np = NULL;
4611
4612 remove_subvol = true;
4613
4614 } else {
4615 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4616 if (r == -EBUSY) {
4617 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
4618 goto finish;
4619 }
4620 if (r < 0) {
4621 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
4622 return r;
4623 }
4624
4625 if (arg_template) {
4626 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
4627 if (r == -EEXIST) {
4628 if (!arg_quiet)
4629 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
4630 } else if (r < 0) {
4631 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
4632 goto finish;
4633 } else {
4634 if (!arg_quiet)
4635 log_info("Populated %s from template %s.", arg_directory, arg_template);
4636 }
4637 }
4638 }
4639
4640 if (arg_boot) {
4641 if (path_is_os_tree(arg_directory) <= 0) {
4642 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
4643 r = -EINVAL;
4644 goto finish;
4645 }
4646 } else {
4647 const char *p;
4648
4649 p = strjoina(arg_directory,
4650 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
4651 if (access(p, F_OK) < 0) {
4652 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
4653 r = -EINVAL;
4654 goto finish;
4655 }
4656 }
4657
4658 } else {
4659 char template[] = "/tmp/nspawn-root-XXXXXX";
4660
4661 assert(arg_image);
4662 assert(!arg_template);
4663
4664 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
4665 if (r == -EBUSY) {
4666 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
4667 goto finish;
4668 }
4669 if (r < 0) {
4670 r = log_error_errno(r, "Failed to create image lock: %m");
4671 goto finish;
4672 }
4673
4674 if (!mkdtemp(template)) {
4675 log_error_errno(errno, "Failed to create temporary directory: %m");
4676 r = -errno;
4677 goto finish;
4678 }
4679
4680 arg_directory = strdup(template);
4681 if (!arg_directory) {
4682 r = log_oom();
4683 goto finish;
4684 }
4685
4686 image_fd = setup_image(&device_path, &loop_nr);
4687 if (image_fd < 0) {
4688 r = image_fd;
4689 goto finish;
4690 }
4691
4692 r = dissect_image(image_fd,
4693 &root_device, &root_device_rw,
4694 &home_device, &home_device_rw,
4695 &srv_device, &srv_device_rw,
4696 &secondary);
4697 if (r < 0)
4698 goto finish;
4699 }
4700
4701 r = custom_mounts_prepare();
4702 if (r < 0)
4703 goto finish;
4704
4705 interactive =
4706 isatty(STDIN_FILENO) > 0 &&
4707 isatty(STDOUT_FILENO) > 0;
4708
4709 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
4710 if (master < 0) {
4711 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
4712 goto finish;
4713 }
4714
4715 r = ptsname_malloc(master, &console);
4716 if (r < 0) {
4717 r = log_error_errno(r, "Failed to determine tty name: %m");
4718 goto finish;
4719 }
4720
4721 if (unlockpt(master) < 0) {
4722 r = log_error_errno(errno, "Failed to unlock tty: %m");
4723 goto finish;
4724 }
4725
4726 if (!arg_quiet)
4727 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4728 arg_machine, arg_image ?: arg_directory);
4729
4730 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
4731
4732 assert_se(sigemptyset(&mask_chld) == 0);
4733 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4734
4735 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
4736 r = log_error_errno(errno, "Failed to become subreaper: %m");
4737 goto finish;
4738 }
4739
4740 for (;;) {
4741 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
4742 uid_shift_socket_pair[2] = { -1, -1 };
4743 ContainerStatus container_status;
4744 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4745 static const struct sigaction sa = {
4746 .sa_handler = nop_handler,
4747 .sa_flags = SA_NOCLDSTOP,
4748 };
4749 int ifi = 0;
4750 ssize_t l;
4751 _cleanup_event_unref_ sd_event *event = NULL;
4752 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4753 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
4754 char last_char = 0;
4755
4756 r = barrier_create(&barrier);
4757 if (r < 0) {
4758 log_error_errno(r, "Cannot initialize IPC barrier: %m");
4759 goto finish;
4760 }
4761
4762 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
4763 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4764 goto finish;
4765 }
4766
4767 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
4768 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4769 goto finish;
4770 }
4771
4772 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
4773 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
4774 goto finish;
4775 }
4776
4777 if (arg_userns)
4778 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
4779 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4780 goto finish;
4781 }
4782
4783 /* Child can be killed before execv(), so handle SIGCHLD
4784 * in order to interrupt parent's blocking calls and
4785 * give it a chance to call wait() and terminate. */
4786 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4787 if (r < 0) {
4788 r = log_error_errno(errno, "Failed to change the signal mask: %m");
4789 goto finish;
4790 }
4791
4792 r = sigaction(SIGCHLD, &sa, NULL);
4793 if (r < 0) {
4794 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4795 goto finish;
4796 }
4797
4798 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
4799 if (pid < 0) {
4800 if (errno == EINVAL)
4801 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4802 else
4803 r = log_error_errno(errno, "clone() failed: %m");
4804
4805 goto finish;
4806 }
4807
4808 if (pid == 0) {
4809 /* The outer child only has a file system namespace. */
4810 barrier_set_role(&barrier, BARRIER_CHILD);
4811
4812 master = safe_close(master);
4813
4814 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4815 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4816 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4817 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4818
4819 (void) reset_all_signal_handlers();
4820 (void) reset_signal_mask();
4821
4822 r = outer_child(&barrier,
4823 arg_directory,
4824 console,
4825 root_device, root_device_rw,
4826 home_device, home_device_rw,
4827 srv_device, srv_device_rw,
4828 interactive,
4829 secondary,
4830 pid_socket_pair[1],
4831 kmsg_socket_pair[1],
4832 rtnl_socket_pair[1],
4833 uid_shift_socket_pair[1],
4834 fds,
4835 argc, argv);
4836 if (r < 0)
4837 _exit(EXIT_FAILURE);
4838
4839 _exit(EXIT_SUCCESS);
4840 }
4841
4842 barrier_set_role(&barrier, BARRIER_PARENT);
4843
4844 fdset_free(fds);
4845 fds = NULL;
4846
4847 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4848 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4849 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4850
4851 /* Wait for the outer child. */
4852 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
4853 if (r < 0)
4854 goto finish;
4855 if (r != 0) {
4856 r = -EIO;
4857 goto finish;
4858 }
4859 pid = 0;
4860
4861 /* And now retrieve the PID of the inner child. */
4862 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
4863 if (l < 0) {
4864 r = log_error_errno(errno, "Failed to read inner child PID: %m");
4865 goto finish;
4866 }
4867 if (l != sizeof(pid)) {
4868 log_error("Short read while reading inner child PID: %m");
4869 r = EIO;
4870 goto finish;
4871 }
4872
4873 log_debug("Init process invoked as PID " PID_FMT, pid);
4874
4875 if (arg_userns) {
4876 if (!barrier_place_and_sync(&barrier)) { /* #1 */
4877 log_error("Child died too early.");
4878 r = -ESRCH;
4879 goto finish;
4880 }
4881
4882 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
4883 if (l < 0) {
4884 r = log_error_errno(errno, "Failed to read UID shift: %m");
4885 goto finish;
4886 }
4887 if (l != sizeof(arg_uid_shift)) {
4888 log_error("Short read while reading UID shift: %m");
4889 r = EIO;
4890 goto finish;
4891 }
4892
4893 r = setup_uid_map(pid);
4894 if (r < 0)
4895 goto finish;
4896
4897 (void) barrier_place(&barrier); /* #2 */
4898 }
4899
4900 r = move_network_interfaces(pid);
4901 if (r < 0)
4902 goto finish;
4903
4904 r = setup_veth(pid, veth_name, &ifi);
4905 if (r < 0)
4906 goto finish;
4907
4908 r = setup_bridge(veth_name, &ifi);
4909 if (r < 0)
4910 goto finish;
4911
4912 r = setup_macvlan(pid);
4913 if (r < 0)
4914 goto finish;
4915
4916 r = setup_ipvlan(pid);
4917 if (r < 0)
4918 goto finish;
4919
4920 r = register_machine(pid, ifi);
4921 if (r < 0)
4922 goto finish;
4923
4924 r = chown_cgroup(pid);
4925 if (r < 0)
4926 goto finish;
4927
4928 /* Notify the child that the parent is ready with all
4929 * its setup (including cgroup-ification), and that
4930 * the child can now hand over control to the code to
4931 * run inside the container. */
4932 (void) barrier_place(&barrier); /* #3 */
4933
4934 /* Block SIGCHLD here, before notifying child.
4935 * process_pty() will handle it with the other signals. */
4936 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
4937
4938 /* Reset signal to default */
4939 r = default_signals(SIGCHLD, -1);
4940 if (r < 0) {
4941 log_error_errno(r, "Failed to reset SIGCHLD: %m");
4942 goto finish;
4943 }
4944
4945 /* Let the child know that we are ready and wait that the child is completely ready now. */
4946 if (!barrier_place_and_sync(&barrier)) { /* #5 */
4947 log_error("Client died too early.");
4948 r = -ESRCH;
4949 goto finish;
4950 }
4951
4952 sd_notifyf(false,
4953 "READY=1\n"
4954 "STATUS=Container running.\n"
4955 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4956
4957 r = sd_event_new(&event);
4958 if (r < 0) {
4959 log_error_errno(r, "Failed to get default event source: %m");
4960 goto finish;
4961 }
4962
4963 if (arg_kill_signal > 0) {
4964 /* Try to kill the init system on SIGINT or SIGTERM */
4965 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4966 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4967 } else {
4968 /* Immediately exit */
4969 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4970 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4971 }
4972
4973 /* simply exit on sigchld */
4974 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4975
4976 if (arg_expose_ports) {
4977 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4978 if (r < 0)
4979 goto finish;
4980
4981 (void) expose_ports(rtnl, &exposed);
4982 }
4983
4984 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4985
4986 r = pty_forward_new(event, master, true, !interactive, &forward);
4987 if (r < 0) {
4988 log_error_errno(r, "Failed to create PTY forwarder: %m");
4989 goto finish;
4990 }
4991
4992 r = sd_event_loop(event);
4993 if (r < 0) {
4994 log_error_errno(r, "Failed to run event loop: %m");
4995 goto finish;
4996 }
4997
4998 pty_forward_get_last_char(forward, &last_char);
4999
5000 forward = pty_forward_free(forward);
5001
5002 if (!arg_quiet && last_char != '\n')
5003 putc('\n', stdout);
5004
5005 /* Kill if it is not dead yet anyway */
5006 terminate_machine(pid);
5007
5008 /* Normally redundant, but better safe than sorry */
5009 kill(pid, SIGKILL);
5010
5011 r = wait_for_container(pid, &container_status);
5012 pid = 0;
5013
5014 if (r < 0)
5015 /* We failed to wait for the container, or the
5016 * container exited abnormally */
5017 goto finish;
5018 else if (r > 0 || container_status == CONTAINER_TERMINATED){
5019 /* The container exited with a non-zero
5020 * status, or with zero status and no reboot
5021 * was requested. */
5022 ret = r;
5023 break;
5024 }
5025
5026 /* CONTAINER_REBOOTED, loop again */
5027
5028 if (arg_keep_unit) {
5029 /* Special handling if we are running as a
5030 * service: instead of simply restarting the
5031 * machine we want to restart the entire
5032 * service, so let's inform systemd about this
5033 * with the special exit code 133. The service
5034 * file uses RestartForceExitStatus=133 so
5035 * that this results in a full nspawn
5036 * restart. This is necessary since we might
5037 * have cgroup parameters set we want to have
5038 * flushed out. */
5039 ret = 133;
5040 r = 0;
5041 break;
5042 }
5043
5044 flush_ports(&exposed);
5045 }
5046
5047 finish:
5048 sd_notify(false,
5049 "STOPPING=1\n"
5050 "STATUS=Terminating...");
5051
5052 if (pid > 0)
5053 kill(pid, SIGKILL);
5054
5055 /* Try to flush whatever is still queued in the pty */
5056 if (master >= 0)
5057 (void) copy_bytes(master, STDOUT_FILENO, (off_t) -1, false);
5058
5059 loop_remove(loop_nr, &image_fd);
5060
5061 if (remove_subvol && arg_directory) {
5062 int k;
5063
5064 k = btrfs_subvol_remove(arg_directory, true);
5065 if (k < 0)
5066 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
5067 }
5068
5069 if (arg_machine) {
5070 const char *p;
5071
5072 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5073 (void) rm_rf(p, REMOVE_ROOT);
5074 }
5075
5076 free(arg_directory);
5077 free(arg_template);
5078 free(arg_image);
5079 free(arg_machine);
5080 free(arg_user);
5081 strv_free(arg_setenv);
5082 strv_free(arg_network_interfaces);
5083 strv_free(arg_network_macvlan);
5084 strv_free(arg_network_ipvlan);
5085 custom_mount_free_all();
5086
5087 flush_ports(&exposed);
5088
5089 while (arg_expose_ports) {
5090 ExposePort *p = arg_expose_ports;
5091 LIST_REMOVE(ports, arg_expose_ports, p);
5092 free(p);
5093 }
5094
5095 return r < 0 ? EXIT_FAILURE : ret;
5096 }