]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
shared: add process-util.[ch]
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "log.h"
60 #include "util.h"
61 #include "mkdir.h"
62 #include "rm-rf.h"
63 #include "macro.h"
64 #include "missing.h"
65 #include "cgroup-util.h"
66 #include "strv.h"
67 #include "path-util.h"
68 #include "loopback-setup.h"
69 #include "dev-setup.h"
70 #include "fdset.h"
71 #include "build.h"
72 #include "fileio.h"
73 #include "bus-util.h"
74 #include "bus-error.h"
75 #include "ptyfwd.h"
76 #include "env-util.h"
77 #include "rtnl-util.h"
78 #include "udev-util.h"
79 #include "blkid-util.h"
80 #include "gpt.h"
81 #include "siphash24.h"
82 #include "copy.h"
83 #include "base-filesystem.h"
84 #include "barrier.h"
85 #include "event-util.h"
86 #include "capability.h"
87 #include "cap-list.h"
88 #include "btrfs-util.h"
89 #include "machine-image.h"
90 #include "list.h"
91 #include "in-addr-util.h"
92 #include "fw-util.h"
93 #include "local-addresses.h"
94 #include "formats-util.h"
95 #include "process-util.h"
96
97 #ifdef HAVE_SECCOMP
98 #include "seccomp-util.h"
99 #endif
100
/* One --port= mapping collected by parse_argv(). Entries are chained into the
 * arg_expose_ports list through the "ports" list fields. */
101 typedef struct ExposePort {
/* IPPROTO_TCP or IPPROTO_UDP, as selected by the optional "tcp:"/"udp:" prefix */
102 int protocol;
103 uint16_t host_port;
104 uint16_t container_port;
105 LIST_FIELDS(struct ExposePort, ports);
106 } ExposePort;
107
/* NOTE(review): not referenced in the visible part of this file; presumably
 * describes how the container ended (plain exit vs. reboot request) - confirm
 * against the users further down. */
108 typedef enum ContainerStatus {
109 CONTAINER_TERMINATED,
110 CONTAINER_REBOOTED
111 } ContainerStatus;
112
/* Journal linking mode selected with --link-journal= (or -j). The "try-guest"
 * and "try-host" command line values map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try additionally set. */
113 typedef enum LinkJournal {
114 LINK_NO,
115 LINK_AUTO,
116 LINK_HOST,
117 LINK_GUEST
118 } LinkJournal;
119
/* Value of --volatile[=MODE]: a bare --volatile or a true boolean selects
 * VOLATILE_YES, a false boolean VOLATILE_NO, and the literal "state"
 * VOLATILE_STATE. */
120 typedef enum Volatile {
121 VOLATILE_NO,
122 VOLATILE_YES,
123 VOLATILE_STATE,
124 } Volatile;
125
/* Settings filled in by parse_argv() from the command line.
 *
 * The "char *" members are heap copies that get freed/replaced when an option
 * is repeated; the "const char *" members point straight at optarg and are
 * not owned. */
126 static char *arg_directory = NULL;
127 static char *arg_template = NULL;
128 static char *arg_user = NULL;
129 static sd_id128_t arg_uuid = {};
130 static char *arg_machine = NULL;
131 static const char *arg_selinux_context = NULL;
132 static const char *arg_selinux_apifs_context = NULL;
133 static const char *arg_slice = NULL;
134 static bool arg_private_network = false;
135 static bool arg_read_only = false;
136 static bool arg_boot = false;
137 static bool arg_ephemeral = false;
138 static LinkJournal arg_link_journal = LINK_AUTO;
139 static bool arg_link_journal_try = false;
/* Default set of capabilities to retain, as a bit mask indexed by capability
 * number. parse_argv() ORs in everything named by --capability= (plus
 * CAP_NET_ADMIN when private networking is on) and then masks out everything
 * named by --drop-capability=. */
140 static uint64_t arg_retain =
141 (1ULL << CAP_CHOWN) |
142 (1ULL << CAP_DAC_OVERRIDE) |
143 (1ULL << CAP_DAC_READ_SEARCH) |
144 (1ULL << CAP_FOWNER) |
145 (1ULL << CAP_FSETID) |
146 (1ULL << CAP_IPC_OWNER) |
147 (1ULL << CAP_KILL) |
148 (1ULL << CAP_LEASE) |
149 (1ULL << CAP_LINUX_IMMUTABLE) |
150 (1ULL << CAP_NET_BIND_SERVICE) |
151 (1ULL << CAP_NET_BROADCAST) |
152 (1ULL << CAP_NET_RAW) |
153 (1ULL << CAP_SETGID) |
154 (1ULL << CAP_SETFCAP) |
155 (1ULL << CAP_SETPCAP) |
156 (1ULL << CAP_SETUID) |
157 (1ULL << CAP_SYS_ADMIN) |
158 (1ULL << CAP_SYS_CHROOT) |
159 (1ULL << CAP_SYS_NICE) |
160 (1ULL << CAP_SYS_PTRACE) |
161 (1ULL << CAP_SYS_TTY_CONFIG) |
162 (1ULL << CAP_SYS_RESOURCE) |
163 (1ULL << CAP_SYS_BOOT) |
164 (1ULL << CAP_AUDIT_WRITE) |
165 (1ULL << CAP_AUDIT_CONTROL) |
166 (1ULL << CAP_MKNOD);
/* arg_bind and arg_bind_ro are flat string vectors holding (source,
 * destination) pairs; arg_tmpfs holds (path, mount options) pairs. */
167 static char **arg_bind = NULL;
168 static char **arg_bind_ro = NULL;
169 static char **arg_tmpfs = NULL;
170 static char **arg_setenv = NULL;
171 static bool arg_quiet = false;
172 static bool arg_share_system = false;
173 static bool arg_register = true;
174 static bool arg_keep_unit = false;
175 static char **arg_network_interfaces = NULL;
176 static char **arg_network_macvlan = NULL;
177 static char **arg_network_ipvlan = NULL;
178 static bool arg_network_veth = false;
179 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() treats as "unknown personality"
 * when returned by personality_from_string(); presumably it doubles as the
 * "not set" marker here - confirm against the users of arg_personality. */
180 static unsigned long arg_personality = 0xffffffffLU;
181 static char *arg_image = NULL;
182 static Volatile arg_volatile = VOLATILE_NO;
183 static ExposePort *arg_expose_ports = NULL;
184 static char **arg_property = NULL;
/* Set from --private-users=UIDBASE[:NUIDS]; the range keeps its default of
 * 0x10000 unless NUIDS is given. */
185 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
186 static bool arg_userns = false;
/* 0 until --kill-signal= is parsed; parse_argv() falls back to SIGRTMIN+3
 * when --boot is used and no positive signal was selected. */
187 static int arg_kill_signal = 0;
188
/* Prints the command line usage summary to stdout. The only format argument
 * is program_invocation_short_name, substituted for the leading "%s". */
189 static void help(void) {
190 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
191 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
192 " -h --help Show this help\n"
193 " --version Print version string\n"
194 " -q --quiet Do not show status information\n"
195 " -D --directory=PATH Root directory for the container\n"
196 " --template=PATH Initialize root directory from template directory,\n"
197 " if missing\n"
198 " -x --ephemeral Run container with snapshot of root directory, and\n"
199 " remove it after exit\n"
200 " -i --image=PATH File system device or disk image for the container\n"
201 " -b --boot Boot up full system (i.e. invoke init)\n"
202 " -u --user=USER Run the command under specified user or uid\n"
203 " -M --machine=NAME Set the machine name for the container\n"
204 " --uuid=UUID Set a specific machine UUID for the container\n"
205 " -S --slice=SLICE Place the container in the specified slice\n"
206 " --property=NAME=VALUE Set scope unit property\n"
207 " --private-network Disable network in container\n"
208 " --network-interface=INTERFACE\n"
209 " Assign an existing network interface to the\n"
210 " container\n"
211 " --network-macvlan=INTERFACE\n"
212 " Create a macvlan network interface based on an\n"
213 " existing network interface to the container\n"
214 " --network-ipvlan=INTERFACE\n"
215 " Create a ipvlan network interface based on an\n"
216 " existing network interface to the container\n"
217 " -n --network-veth Add a virtual ethernet connection between host\n"
218 " and container\n"
219 " --network-bridge=INTERFACE\n"
220 " Add a virtual ethernet connection between host\n"
221 " and container and add it to an existing bridge on\n"
222 " the host\n"
223 " --private-users[=UIDBASE[:NUIDS]]\n"
224 " Run within user namespace\n"
225 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
226 " Expose a container IP port on the host\n"
227 " -Z --selinux-context=SECLABEL\n"
228 " Set the SELinux security context to be used by\n"
229 " processes in the container\n"
230 " -L --selinux-apifs-context=SECLABEL\n"
231 " Set the SELinux security context to be used by\n"
232 " API/tmpfs file systems in the container\n"
233 " --capability=CAP In addition to the default, retain specified\n"
234 " capability\n"
235 " --drop-capability=CAP Drop the specified capability from the default set\n"
236 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
237 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
238 " try-guest, try-host\n"
239 " -j Equivalent to --link-journal=try-guest\n"
240 " --read-only Mount the root directory read-only\n"
241 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
242 " the container\n"
243 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
244 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
245 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
246 " --share-system Share system namespaces with host\n"
247 " --register=BOOLEAN Register container as machine\n"
248 " --keep-unit Do not register a scope for the machine, reuse\n"
249 " the service unit nspawn is running in\n"
250 " --volatile[=MODE] Run the system in volatile mode\n"
251 , program_invocation_short_name);
252 }
253
/* Replaces *b with a sanitized, absolute version of path.
 *
 * The path is canonicalized if possible. If that fails because the path does
 * not exist (ENOENT), it is merely made absolute relative to the current
 * working directory. The result is passed through path_kill_slashes() and
 * stored in *b, after the previous value of *b has been freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason other
 * than ENOENT, and -ENOMEM if the absolute path cannot be built. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        if (resolved == NULL) {
                /* Nothing there (yet): settle for an absolute, uncanonicalized path. */
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
274
275 static int parse_argv(int argc, char *argv[]) {
276
277 enum {
278 ARG_VERSION = 0x100,
279 ARG_PRIVATE_NETWORK,
280 ARG_UUID,
281 ARG_READ_ONLY,
282 ARG_CAPABILITY,
283 ARG_DROP_CAPABILITY,
284 ARG_LINK_JOURNAL,
285 ARG_BIND,
286 ARG_BIND_RO,
287 ARG_TMPFS,
288 ARG_SETENV,
289 ARG_SHARE_SYSTEM,
290 ARG_REGISTER,
291 ARG_KEEP_UNIT,
292 ARG_NETWORK_INTERFACE,
293 ARG_NETWORK_MACVLAN,
294 ARG_NETWORK_IPVLAN,
295 ARG_NETWORK_BRIDGE,
296 ARG_PERSONALITY,
297 ARG_VOLATILE,
298 ARG_TEMPLATE,
299 ARG_PROPERTY,
300 ARG_PRIVATE_USERS,
301 ARG_KILL_SIGNAL,
302 };
303
304 static const struct option options[] = {
305 { "help", no_argument, NULL, 'h' },
306 { "version", no_argument, NULL, ARG_VERSION },
307 { "directory", required_argument, NULL, 'D' },
308 { "template", required_argument, NULL, ARG_TEMPLATE },
309 { "ephemeral", no_argument, NULL, 'x' },
310 { "user", required_argument, NULL, 'u' },
311 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
312 { "boot", no_argument, NULL, 'b' },
313 { "uuid", required_argument, NULL, ARG_UUID },
314 { "read-only", no_argument, NULL, ARG_READ_ONLY },
315 { "capability", required_argument, NULL, ARG_CAPABILITY },
316 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
317 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
318 { "bind", required_argument, NULL, ARG_BIND },
319 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
320 { "tmpfs", required_argument, NULL, ARG_TMPFS },
321 { "machine", required_argument, NULL, 'M' },
322 { "slice", required_argument, NULL, 'S' },
323 { "setenv", required_argument, NULL, ARG_SETENV },
324 { "selinux-context", required_argument, NULL, 'Z' },
325 { "selinux-apifs-context", required_argument, NULL, 'L' },
326 { "quiet", no_argument, NULL, 'q' },
327 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
328 { "register", required_argument, NULL, ARG_REGISTER },
329 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
330 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
331 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
332 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
333 { "network-veth", no_argument, NULL, 'n' },
334 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
335 { "personality", required_argument, NULL, ARG_PERSONALITY },
336 { "image", required_argument, NULL, 'i' },
337 { "volatile", optional_argument, NULL, ARG_VOLATILE },
338 { "port", required_argument, NULL, 'p' },
339 { "property", required_argument, NULL, ARG_PROPERTY },
340 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
341 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
342 {}
343 };
344
345 int c, r;
346 uint64_t plus = 0, minus = 0;
347
348 assert(argc >= 0);
349 assert(argv);
350
351 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
352
353 switch (c) {
354
355 case 'h':
356 help();
357 return 0;
358
359 case ARG_VERSION:
360 puts(PACKAGE_STRING);
361 puts(SYSTEMD_FEATURES);
362 return 0;
363
364 case 'D':
365 r = set_sanitized_path(&arg_directory, optarg);
366 if (r < 0)
367 return log_error_errno(r, "Invalid root directory: %m");
368
369 break;
370
371 case ARG_TEMPLATE:
372 r = set_sanitized_path(&arg_template, optarg);
373 if (r < 0)
374 return log_error_errno(r, "Invalid template directory: %m");
375
376 break;
377
378 case 'i':
379 r = set_sanitized_path(&arg_image, optarg);
380 if (r < 0)
381 return log_error_errno(r, "Invalid image path: %m");
382
383 break;
384
385 case 'x':
386 arg_ephemeral = true;
387 break;
388
389 case 'u':
390 free(arg_user);
391 arg_user = strdup(optarg);
392 if (!arg_user)
393 return log_oom();
394
395 break;
396
397 case ARG_NETWORK_BRIDGE:
398 arg_network_bridge = optarg;
399
400 /* fall through */
401
402 case 'n':
403 arg_network_veth = true;
404 arg_private_network = true;
405 break;
406
407 case ARG_NETWORK_INTERFACE:
408 if (strv_extend(&arg_network_interfaces, optarg) < 0)
409 return log_oom();
410
411 arg_private_network = true;
412 break;
413
414 case ARG_NETWORK_MACVLAN:
415 if (strv_extend(&arg_network_macvlan, optarg) < 0)
416 return log_oom();
417
418 arg_private_network = true;
419 break;
420
421 case ARG_NETWORK_IPVLAN:
422 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
423 return log_oom();
424
425 /* fall through */
426
427 case ARG_PRIVATE_NETWORK:
428 arg_private_network = true;
429 break;
430
431 case 'b':
432 arg_boot = true;
433 break;
434
435 case ARG_UUID:
436 r = sd_id128_from_string(optarg, &arg_uuid);
437 if (r < 0) {
438 log_error("Invalid UUID: %s", optarg);
439 return r;
440 }
441 break;
442
443 case 'S':
444 arg_slice = optarg;
445 break;
446
447 case 'M':
448 if (isempty(optarg)) {
449 free(arg_machine);
450 arg_machine = NULL;
451 } else {
452 if (!machine_name_is_valid(optarg)) {
453 log_error("Invalid machine name: %s", optarg);
454 return -EINVAL;
455 }
456
457 r = free_and_strdup(&arg_machine, optarg);
458 if (r < 0)
459 return log_oom();
460
461 break;
462 }
463
464 case 'Z':
465 arg_selinux_context = optarg;
466 break;
467
468 case 'L':
469 arg_selinux_apifs_context = optarg;
470 break;
471
472 case ARG_READ_ONLY:
473 arg_read_only = true;
474 break;
475
476 case ARG_CAPABILITY:
477 case ARG_DROP_CAPABILITY: {
478 const char *state, *word;
479 size_t length;
480
481 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
482 _cleanup_free_ char *t;
483
484 t = strndup(word, length);
485 if (!t)
486 return log_oom();
487
488 if (streq(t, "all")) {
489 if (c == ARG_CAPABILITY)
490 plus = (uint64_t) -1;
491 else
492 minus = (uint64_t) -1;
493 } else {
494 int cap;
495
496 cap = capability_from_name(t);
497 if (cap < 0) {
498 log_error("Failed to parse capability %s.", t);
499 return -EINVAL;
500 }
501
502 if (c == ARG_CAPABILITY)
503 plus |= 1ULL << (uint64_t) cap;
504 else
505 minus |= 1ULL << (uint64_t) cap;
506 }
507 }
508
509 break;
510 }
511
512 case 'j':
513 arg_link_journal = LINK_GUEST;
514 arg_link_journal_try = true;
515 break;
516
517 case ARG_LINK_JOURNAL:
518 if (streq(optarg, "auto")) {
519 arg_link_journal = LINK_AUTO;
520 arg_link_journal_try = false;
521 } else if (streq(optarg, "no")) {
522 arg_link_journal = LINK_NO;
523 arg_link_journal_try = false;
524 } else if (streq(optarg, "guest")) {
525 arg_link_journal = LINK_GUEST;
526 arg_link_journal_try = false;
527 } else if (streq(optarg, "host")) {
528 arg_link_journal = LINK_HOST;
529 arg_link_journal_try = false;
530 } else if (streq(optarg, "try-guest")) {
531 arg_link_journal = LINK_GUEST;
532 arg_link_journal_try = true;
533 } else if (streq(optarg, "try-host")) {
534 arg_link_journal = LINK_HOST;
535 arg_link_journal_try = true;
536 } else {
537 log_error("Failed to parse link journal mode %s", optarg);
538 return -EINVAL;
539 }
540
541 break;
542
543 case ARG_BIND:
544 case ARG_BIND_RO: {
545 _cleanup_free_ char *a = NULL, *b = NULL;
546 char *e;
547 char ***x;
548
549 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
550
551 e = strchr(optarg, ':');
552 if (e) {
553 a = strndup(optarg, e - optarg);
554 b = strdup(e + 1);
555 } else {
556 a = strdup(optarg);
557 b = strdup(optarg);
558 }
559
560 if (!a || !b)
561 return log_oom();
562
563 if (!path_is_absolute(a) || !path_is_absolute(b)) {
564 log_error("Invalid bind mount specification: %s", optarg);
565 return -EINVAL;
566 }
567
568 r = strv_extend(x, a);
569 if (r < 0)
570 return log_oom();
571
572 r = strv_extend(x, b);
573 if (r < 0)
574 return log_oom();
575
576 break;
577 }
578
579 case ARG_TMPFS: {
580 _cleanup_free_ char *a = NULL, *b = NULL;
581 char *e;
582
583 e = strchr(optarg, ':');
584 if (e) {
585 a = strndup(optarg, e - optarg);
586 b = strdup(e + 1);
587 } else {
588 a = strdup(optarg);
589 b = strdup("mode=0755");
590 }
591
592 if (!a || !b)
593 return log_oom();
594
595 if (!path_is_absolute(a)) {
596 log_error("Invalid tmpfs specification: %s", optarg);
597 return -EINVAL;
598 }
599
600 r = strv_push(&arg_tmpfs, a);
601 if (r < 0)
602 return log_oom();
603
604 a = NULL;
605
606 r = strv_push(&arg_tmpfs, b);
607 if (r < 0)
608 return log_oom();
609
610 b = NULL;
611
612 break;
613 }
614
615 case ARG_SETENV: {
616 char **n;
617
618 if (!env_assignment_is_valid(optarg)) {
619 log_error("Environment variable assignment '%s' is not valid.", optarg);
620 return -EINVAL;
621 }
622
623 n = strv_env_set(arg_setenv, optarg);
624 if (!n)
625 return log_oom();
626
627 strv_free(arg_setenv);
628 arg_setenv = n;
629 break;
630 }
631
632 case 'q':
633 arg_quiet = true;
634 break;
635
636 case ARG_SHARE_SYSTEM:
637 arg_share_system = true;
638 break;
639
640 case ARG_REGISTER:
641 r = parse_boolean(optarg);
642 if (r < 0) {
643 log_error("Failed to parse --register= argument: %s", optarg);
644 return r;
645 }
646
647 arg_register = r;
648 break;
649
650 case ARG_KEEP_UNIT:
651 arg_keep_unit = true;
652 break;
653
654 case ARG_PERSONALITY:
655
656 arg_personality = personality_from_string(optarg);
657 if (arg_personality == 0xffffffffLU) {
658 log_error("Unknown or unsupported personality '%s'.", optarg);
659 return -EINVAL;
660 }
661
662 break;
663
664 case ARG_VOLATILE:
665
666 if (!optarg)
667 arg_volatile = VOLATILE_YES;
668 else {
669 r = parse_boolean(optarg);
670 if (r < 0) {
671 if (streq(optarg, "state"))
672 arg_volatile = VOLATILE_STATE;
673 else {
674 log_error("Failed to parse --volatile= argument: %s", optarg);
675 return r;
676 }
677 } else
678 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
679 }
680
681 break;
682
683 case 'p': {
684 const char *split, *e;
685 uint16_t container_port, host_port;
686 int protocol;
687 ExposePort *p;
688
689 if ((e = startswith(optarg, "tcp:")))
690 protocol = IPPROTO_TCP;
691 else if ((e = startswith(optarg, "udp:")))
692 protocol = IPPROTO_UDP;
693 else {
694 e = optarg;
695 protocol = IPPROTO_TCP;
696 }
697
698 split = strchr(e, ':');
699 if (split) {
700 char v[split - e + 1];
701
702 memcpy(v, e, split - e);
703 v[split - e] = 0;
704
705 r = safe_atou16(v, &host_port);
706 if (r < 0 || host_port <= 0) {
707 log_error("Failed to parse host port: %s", optarg);
708 return -EINVAL;
709 }
710
711 r = safe_atou16(split + 1, &container_port);
712 } else {
713 r = safe_atou16(e, &container_port);
714 host_port = container_port;
715 }
716
717 if (r < 0 || container_port <= 0) {
718 log_error("Failed to parse host port: %s", optarg);
719 return -EINVAL;
720 }
721
722 LIST_FOREACH(ports, p, arg_expose_ports) {
723 if (p->protocol == protocol && p->host_port == host_port) {
724 log_error("Duplicate port specification: %s", optarg);
725 return -EINVAL;
726 }
727 }
728
729 p = new(ExposePort, 1);
730 if (!p)
731 return log_oom();
732
733 p->protocol = protocol;
734 p->host_port = host_port;
735 p->container_port = container_port;
736
737 LIST_PREPEND(ports, arg_expose_ports, p);
738
739 break;
740 }
741
742 case ARG_PROPERTY:
743 if (strv_extend(&arg_property, optarg) < 0)
744 return log_oom();
745
746 break;
747
748 case ARG_PRIVATE_USERS:
749 if (optarg) {
750 _cleanup_free_ char *buffer = NULL;
751 const char *range, *shift;
752
753 range = strchr(optarg, ':');
754 if (range) {
755 buffer = strndup(optarg, range - optarg);
756 if (!buffer)
757 return log_oom();
758 shift = buffer;
759
760 range++;
761 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
762 log_error("Failed to parse UID range: %s", range);
763 return -EINVAL;
764 }
765 } else
766 shift = optarg;
767
768 if (parse_uid(shift, &arg_uid_shift) < 0) {
769 log_error("Failed to parse UID: %s", optarg);
770 return -EINVAL;
771 }
772 }
773
774 arg_userns = true;
775 break;
776
777 case ARG_KILL_SIGNAL:
778 arg_kill_signal = signal_from_string_try_harder(optarg);
779 if (arg_kill_signal < 0) {
780 log_error("Cannot parse signal: %s", optarg);
781 return -EINVAL;
782 }
783
784 break;
785
786 case '?':
787 return -EINVAL;
788
789 default:
790 assert_not_reached("Unhandled option");
791 }
792
793 if (arg_share_system)
794 arg_register = false;
795
796 if (arg_boot && arg_share_system) {
797 log_error("--boot and --share-system may not be combined.");
798 return -EINVAL;
799 }
800
801 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
802 log_error("--keep-unit may not be used when invoked from a user session.");
803 return -EINVAL;
804 }
805
806 if (arg_directory && arg_image) {
807 log_error("--directory= and --image= may not be combined.");
808 return -EINVAL;
809 }
810
811 if (arg_template && arg_image) {
812 log_error("--template= and --image= may not be combined.");
813 return -EINVAL;
814 }
815
816 if (arg_template && !(arg_directory || arg_machine)) {
817 log_error("--template= needs --directory= or --machine=.");
818 return -EINVAL;
819 }
820
821 if (arg_ephemeral && arg_template) {
822 log_error("--ephemeral and --template= may not be combined.");
823 return -EINVAL;
824 }
825
826 if (arg_ephemeral && arg_image) {
827 log_error("--ephemeral and --image= may not be combined.");
828 return -EINVAL;
829 }
830
831 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
832 log_error("--ephemeral and --link-journal= may not be combined.");
833 return -EINVAL;
834 }
835
836 if (arg_volatile != VOLATILE_NO && arg_read_only) {
837 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
838 return -EINVAL;
839 }
840
841 if (arg_expose_ports && !arg_private_network) {
842 log_error("Cannot use --port= without private networking.");
843 return -EINVAL;
844 }
845
846 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
847
848 if (arg_boot && arg_kill_signal <= 0)
849 arg_kill_signal = SIGRTMIN+3;
850
851 return 1;
852 }
853
854 static int mount_all(const char *dest) {
855
856 typedef struct MountPoint {
857 const char *what;
858 const char *where;
859 const char *type;
860 const char *options;
861 unsigned long flags;
862 bool fatal;
863 } MountPoint;
864
865 static const MountPoint mount_table[] = {
866 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
867 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
868 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
869 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
870 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
871 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
872 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
873 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
874 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
875 #ifdef HAVE_SELINUX
876 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
877 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
878 #endif
879 };
880
881 unsigned k;
882 int r = 0;
883
884 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
885 _cleanup_free_ char *where = NULL, *options = NULL;
886 const char *o;
887 int t;
888
889 where = strjoin(dest, "/", mount_table[k].where, NULL);
890 if (!where)
891 return log_oom();
892
893 t = path_is_mount_point(where, true);
894 if (t < 0 && t != -ENOENT) {
895 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
896
897 if (r == 0)
898 r = t;
899
900 continue;
901 }
902
903 /* Skip this entry if it is not a remount. */
904 if (mount_table[k].what && t > 0)
905 continue;
906
907 t = mkdir_p(where, 0755);
908 if (t < 0) {
909 if (mount_table[k].fatal) {
910 log_error_errno(t, "Failed to create directory %s: %m", where);
911
912 if (r == 0)
913 r = t;
914 } else
915 log_warning_errno(t, "Failed to create directory %s: %m", where);
916
917 continue;
918 }
919
920 #ifdef HAVE_SELINUX
921 if (arg_selinux_apifs_context &&
922 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
923 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
924 if (!options)
925 return log_oom();
926
927 o = options;
928 } else
929 #endif
930 o = mount_table[k].options;
931
932 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
933 char *uid_options = NULL;
934
935 if (o)
936 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
937 else
938 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
939 if (!uid_options)
940 return log_oom();
941
942 free(options);
943 o = options = uid_options;
944 }
945
946 if (mount(mount_table[k].what,
947 where,
948 mount_table[k].type,
949 mount_table[k].flags,
950 o) < 0) {
951
952 if (mount_table[k].fatal) {
953 log_error_errno(errno, "mount(%s) failed: %m", where);
954
955 if (r == 0)
956 r = -errno;
957 } else
958 log_warning_errno(errno, "mount(%s) failed: %m", where);
959 }
960 }
961
962 return r;
963 }
964
965 static int mount_binds(const char *dest, char **l, bool ro) {
966 char **x, **y;
967
968 STRV_FOREACH_PAIR(x, y, l) {
969 _cleanup_free_ char *where = NULL;
970 struct stat source_st, dest_st;
971 int r;
972
973 if (stat(*x, &source_st) < 0)
974 return log_error_errno(errno, "Failed to stat %s: %m", *x);
975
976 where = strappend(dest, *y);
977 if (!where)
978 return log_oom();
979
980 r = stat(where, &dest_st);
981 if (r == 0) {
982 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
983 log_error("Cannot bind mount directory %s on file %s.", *x, where);
984 return -EINVAL;
985 }
986 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
987 log_error("Cannot bind mount file %s on directory %s.", *x, where);
988 return -EINVAL;
989 }
990 } else if (errno == ENOENT) {
991 r = mkdir_parents_label(where, 0755);
992 if (r < 0)
993 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
994 } else {
995 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
996 return -errno;
997 }
998
999 /* Create the mount point. Any non-directory file can be
1000 * mounted on any non-directory file (regular, fifo, socket,
1001 * char, block).
1002 */
1003 if (S_ISDIR(source_st.st_mode)) {
1004 r = mkdir_label(where, 0755);
1005 if (r < 0 && errno != EEXIST)
1006 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1007 } else {
1008 r = touch(where);
1009 if (r < 0)
1010 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1011 }
1012
1013 if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
1014 return log_error_errno(errno, "mount(%s) failed: %m", where);
1015
1016 if (ro) {
1017 r = bind_remount_recursive(where, true);
1018 if (r < 0)
1019 return log_error_errno(r, "Read-Only bind mount failed: %m");
1020 }
1021 }
1022
1023 return 0;
1024 }
1025
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>, passing
 * "controller" as the mount options. If "read_only" is set, the new mount is
 * afterwards remounted read-only as a bind mount.
 *
 * Returns 0 if the target already is a mount point (nothing is done then), 1
 * if the hierarchy was mounted, and a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        const unsigned long base_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int mounted;

        target = strjoina(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(target, false);
        if (mounted > 0)
                return 0;
        if (mounted < 0 && mounted != -ENOENT)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", target);

        /* Failure to create the directory is not checked here; the mount()
         * call right below is what gets checked. */
        (void) mkdir_p(target, 0755);

        /* The superblock mount options of the mount point need to be
         * identical to the hosts', and hence writable... */
        if (mount("cgroup", target, "cgroup", base_flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        /* ... hence let's only make the bind mount read-only, not the
         * superblock. */
        if (read_only &&
            mount(NULL, target, NULL, MS_BIND|MS_REMOUNT|base_flags|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", target);

        return 1;
}
1053
1054 static int mount_cgroup(const char *dest) {
1055 _cleanup_set_free_free_ Set *controllers = NULL;
1056 _cleanup_free_ char *own_cgroup_path = NULL;
1057 const char *cgroup_root, *systemd_root, *systemd_own;
1058 int r;
1059
1060 controllers = set_new(&string_hash_ops);
1061 if (!controllers)
1062 return log_oom();
1063
1064 r = cg_kernel_controllers(controllers);
1065 if (r < 0)
1066 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1067
1068 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1069 if (r < 0)
1070 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1071
1072 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1073 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1074 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1075
1076 for (;;) {
1077 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1078
1079 controller = set_steal_first(controllers);
1080 if (!controller)
1081 break;
1082
1083 origin = strappend("/sys/fs/cgroup/", controller);
1084 if (!origin)
1085 return log_oom();
1086
1087 r = readlink_malloc(origin, &combined);
1088 if (r == -EINVAL) {
1089 /* Not a symbolic link, but directly a single cgroup hierarchy */
1090
1091 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1092 if (r < 0)
1093 return r;
1094
1095 } else if (r < 0)
1096 return log_error_errno(r, "Failed to read link %s: %m", origin);
1097 else {
1098 _cleanup_free_ char *target = NULL;
1099
1100 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1101 if (!target)
1102 return log_oom();
1103
1104 /* A symbolic link, a combination of controllers in one hierarchy */
1105
1106 if (!filename_is_valid(combined)) {
1107 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1108 continue;
1109 }
1110
1111 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1112 if (r < 0)
1113 return r;
1114
1115 if (symlink(combined, target) < 0)
1116 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1117 }
1118 }
1119
1120 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1121 if (r < 0)
1122 return r;
1123
1124 /* Make our own cgroup a (writable) bind mount */
1125 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1126 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1127 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1128
1129 /* And then remount the systemd cgroup root read-only */
1130 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1131 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1132 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1133
1134 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1135 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1136
1137 return 0;
1138 }
1139
1140 static int mount_tmpfs(const char *dest) {
1141 char **i, **o;
1142
1143 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1144 _cleanup_free_ char *where = NULL;
1145 int r;
1146
1147 where = strappend(dest, *i);
1148 if (!where)
1149 return log_oom();
1150
1151 r = mkdir_label(where, 0755);
1152 if (r < 0 && r != -EEXIST)
1153 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1154
1155 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1156 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1157 }
1158
1159 return 0;
1160 }
1161
1162 static int setup_timezone(const char *dest) {
1163 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1164 char *z, *y;
1165 int r;
1166
1167 assert(dest);
1168
1169 /* Fix the timezone, if possible */
1170 r = readlink_malloc("/etc/localtime", &p);
1171 if (r < 0) {
1172 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1173 return 0;
1174 }
1175
1176 z = path_startswith(p, "../usr/share/zoneinfo/");
1177 if (!z)
1178 z = path_startswith(p, "/usr/share/zoneinfo/");
1179 if (!z) {
1180 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1181 return 0;
1182 }
1183
1184 where = strappend(dest, "/etc/localtime");
1185 if (!where)
1186 return log_oom();
1187
1188 r = readlink_malloc(where, &q);
1189 if (r >= 0) {
1190 y = path_startswith(q, "../usr/share/zoneinfo/");
1191 if (!y)
1192 y = path_startswith(q, "/usr/share/zoneinfo/");
1193
1194 /* Already pointing to the right place? Then do nothing .. */
1195 if (y && streq(y, z))
1196 return 0;
1197 }
1198
1199 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1200 if (!check)
1201 return log_oom();
1202
1203 if (access(check, F_OK) < 0) {
1204 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1205 return 0;
1206 }
1207
1208 what = strappend("../usr/share/zoneinfo/", z);
1209 if (!what)
1210 return log_oom();
1211
1212 r = mkdir_parents(where, 0755);
1213 if (r < 0) {
1214 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1215
1216 return 0;
1217 }
1218
1219 r = unlink(where);
1220 if (r < 0 && errno != ENOENT) {
1221 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1222
1223 return 0;
1224 }
1225
1226 if (symlink(what, where) < 0) {
1227 log_error_errno(errno, "Failed to correct timezone of container: %m");
1228 return 0;
1229 }
1230
1231 return 0;
1232 }
1233
1234 static int setup_resolv_conf(const char *dest) {
1235 _cleanup_free_ char *where = NULL;
1236 int r;
1237
1238 assert(dest);
1239
1240 if (arg_private_network)
1241 return 0;
1242
1243 /* Fix resolv.conf, if possible */
1244 where = strappend(dest, "/etc/resolv.conf");
1245 if (!where)
1246 return log_oom();
1247
1248 /* We don't really care for the results of this really. If it
1249 * fails, it fails, but meh... */
1250 r = mkdir_parents(where, 0755);
1251 if (r < 0) {
1252 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1253
1254 return 0;
1255 }
1256
1257 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1258 if (r < 0) {
1259 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1260
1261 return 0;
1262 }
1263
1264 return 0;
1265 }
1266
1267 static int setup_volatile_state(const char *directory) {
1268 const char *p;
1269 int r;
1270
1271 assert(directory);
1272
1273 if (arg_volatile != VOLATILE_STATE)
1274 return 0;
1275
1276 /* --volatile=state means we simply overmount /var
1277 with a tmpfs, and the rest read-only. */
1278
1279 r = bind_remount_recursive(directory, true);
1280 if (r < 0)
1281 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1282
1283 p = strjoina(directory, "/var");
1284 r = mkdir(p, 0755);
1285 if (r < 0 && errno != EEXIST)
1286 return log_error_errno(errno, "Failed to create %s: %m", directory);
1287
1288 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1289 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1290
1291 return 0;
1292 }
1293
1294 static int setup_volatile(const char *directory) {
1295 bool tmpfs_mounted = false, bind_mounted = false;
1296 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1297 const char *f, *t;
1298 int r;
1299
1300 assert(directory);
1301
1302 if (arg_volatile != VOLATILE_YES)
1303 return 0;
1304
1305 /* --volatile=yes means we mount a tmpfs to the root dir, and
1306 the original /usr to use inside it, and that read-only. */
1307
1308 if (!mkdtemp(template))
1309 return log_error_errno(errno, "Failed to create temporary directory: %m");
1310
1311 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1312 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1313 r = -errno;
1314 goto fail;
1315 }
1316
1317 tmpfs_mounted = true;
1318
1319 f = strjoina(directory, "/usr");
1320 t = strjoina(template, "/usr");
1321
1322 r = mkdir(t, 0755);
1323 if (r < 0 && errno != EEXIST) {
1324 log_error_errno(errno, "Failed to create %s: %m", t);
1325 r = -errno;
1326 goto fail;
1327 }
1328
1329 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1330 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1331 r = -errno;
1332 goto fail;
1333 }
1334
1335 bind_mounted = true;
1336
1337 r = bind_remount_recursive(t, true);
1338 if (r < 0) {
1339 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1340 goto fail;
1341 }
1342
1343 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1344 log_error_errno(errno, "Failed to move root mount: %m");
1345 r = -errno;
1346 goto fail;
1347 }
1348
1349 rmdir(template);
1350
1351 return 0;
1352
1353 fail:
1354 if (bind_mounted)
1355 umount(t);
1356 if (tmpfs_mounted)
1357 umount(template);
1358 rmdir(template);
1359 return r;
1360 }
1361
1362 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1363
1364 snprintf(s, 37,
1365 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1366 SD_ID128_FORMAT_VAL(id));
1367
1368 return s;
1369 }
1370
1371 static int setup_boot_id(const char *dest) {
1372 _cleanup_free_ char *from = NULL, *to = NULL;
1373 sd_id128_t rnd = {};
1374 char as_uuid[37];
1375 int r;
1376
1377 assert(dest);
1378
1379 if (arg_share_system)
1380 return 0;
1381
1382 /* Generate a new randomized boot ID, so that each boot-up of
1383 * the container gets a new one */
1384
1385 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1386 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1387 if (!from || !to)
1388 return log_oom();
1389
1390 r = sd_id128_randomize(&rnd);
1391 if (r < 0)
1392 return log_error_errno(r, "Failed to generate random boot id: %m");
1393
1394 id128_format_as_uuid(rnd, as_uuid);
1395
1396 r = write_string_file(from, as_uuid);
1397 if (r < 0)
1398 return log_error_errno(r, "Failed to write boot id: %m");
1399
1400 if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
1401 log_error_errno(errno, "Failed to bind mount boot id: %m");
1402 r = -errno;
1403 } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1404 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1405
1406 unlink(from);
1407 return r;
1408 }
1409
1410 static int copy_devnodes(const char *dest) {
1411
1412 static const char devnodes[] =
1413 "null\0"
1414 "zero\0"
1415 "full\0"
1416 "random\0"
1417 "urandom\0"
1418 "tty\0"
1419 "net/tun\0";
1420
1421 const char *d;
1422 int r = 0;
1423 _cleanup_umask_ mode_t u;
1424
1425 assert(dest);
1426
1427 u = umask(0000);
1428
1429 NULSTR_FOREACH(d, devnodes) {
1430 _cleanup_free_ char *from = NULL, *to = NULL;
1431 struct stat st;
1432
1433 from = strappend("/dev/", d);
1434 to = strjoin(dest, "/dev/", d, NULL);
1435 if (!from || !to)
1436 return log_oom();
1437
1438 if (stat(from, &st) < 0) {
1439
1440 if (errno != ENOENT)
1441 return log_error_errno(errno, "Failed to stat %s: %m", from);
1442
1443 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1444
1445 log_error("%s is not a char or block device, cannot copy", from);
1446 return -EIO;
1447
1448 } else {
1449 r = mkdir_parents(to, 0775);
1450 if (r < 0) {
1451 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1452 return -r;
1453 }
1454
1455 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1456 if (errno != EPERM)
1457 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1458
1459 /* Some systems abusively restrict mknod but
1460 * allow bind mounts. */
1461 r = touch(to);
1462 if (r < 0)
1463 return log_error_errno(r, "touch (%s) failed: %m", to);
1464 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1465 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1466 }
1467
1468 if (arg_userns && arg_uid_shift != UID_INVALID)
1469 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1470 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1471 }
1472 }
1473
1474 return r;
1475 }
1476
1477 static int setup_ptmx(const char *dest) {
1478 _cleanup_free_ char *p = NULL;
1479
1480 p = strappend(dest, "/dev/ptmx");
1481 if (!p)
1482 return log_oom();
1483
1484 if (symlink("pts/ptmx", p) < 0)
1485 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1486
1487 if (arg_userns && arg_uid_shift != UID_INVALID)
1488 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1489 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1490
1491 return 0;
1492 }
1493
1494 static int setup_dev_console(const char *dest, const char *console) {
1495 _cleanup_umask_ mode_t u;
1496 const char *to;
1497 int r;
1498
1499 assert(dest);
1500 assert(console);
1501
1502 u = umask(0000);
1503
1504 r = chmod_and_chown(console, 0600, 0, 0);
1505 if (r < 0)
1506 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1507
1508 /* We need to bind mount the right tty to /dev/console since
1509 * ptys can only exist on pts file systems. To have something
1510 * to bind mount things on we create a empty regular file. */
1511
1512 to = strjoina(dest, "/dev/console");
1513 r = touch(to);
1514 if (r < 0)
1515 return log_error_errno(r, "touch() for /dev/console failed: %m");
1516
1517 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1518 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1519
1520 return 0;
1521 }
1522
1523 static int setup_kmsg(const char *dest, int kmsg_socket) {
1524 _cleanup_free_ char *from = NULL, *to = NULL;
1525 _cleanup_umask_ mode_t u;
1526 int r, fd, k;
1527 union {
1528 struct cmsghdr cmsghdr;
1529 uint8_t buf[CMSG_SPACE(sizeof(int))];
1530 } control = {};
1531 struct msghdr mh = {
1532 .msg_control = &control,
1533 .msg_controllen = sizeof(control),
1534 };
1535 struct cmsghdr *cmsg;
1536
1537 assert(dest);
1538 assert(kmsg_socket >= 0);
1539
1540 u = umask(0000);
1541
1542 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1543 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1544 * on the reading side behave very similar to /proc/kmsg,
1545 * their writing side behaves differently from /dev/kmsg in
1546 * that writing blocks when nothing is reading. In order to
1547 * avoid any problems with containers deadlocking due to this
1548 * we simply make /dev/kmsg unavailable to the container. */
1549 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1550 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1551 return log_oom();
1552
1553 if (mkfifo(from, 0600) < 0)
1554 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1555
1556 r = chmod_and_chown(from, 0600, 0, 0);
1557 if (r < 0)
1558 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1559
1560 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1561 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1562
1563 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1564 if (fd < 0)
1565 return log_error_errno(errno, "Failed to open fifo: %m");
1566
1567 cmsg = CMSG_FIRSTHDR(&mh);
1568 cmsg->cmsg_level = SOL_SOCKET;
1569 cmsg->cmsg_type = SCM_RIGHTS;
1570 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1571 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1572
1573 mh.msg_controllen = cmsg->cmsg_len;
1574
1575 /* Store away the fd in the socket, so that it stays open as
1576 * long as we run the child */
1577 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1578 safe_close(fd);
1579
1580 if (k < 0)
1581 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1582
1583 /* And now make the FIFO unavailable as /dev/kmsg... */
1584 unlink(from);
1585 return 0;
1586 }
1587
1588 static int send_rtnl(int send_fd) {
1589 union {
1590 struct cmsghdr cmsghdr;
1591 uint8_t buf[CMSG_SPACE(sizeof(int))];
1592 } control = {};
1593 struct msghdr mh = {
1594 .msg_control = &control,
1595 .msg_controllen = sizeof(control),
1596 };
1597 struct cmsghdr *cmsg;
1598 _cleanup_close_ int fd = -1;
1599 ssize_t k;
1600
1601 assert(send_fd >= 0);
1602
1603 if (!arg_expose_ports)
1604 return 0;
1605
1606 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1607 if (fd < 0)
1608 return log_error_errno(errno, "failed to allocate container netlink: %m");
1609
1610 cmsg = CMSG_FIRSTHDR(&mh);
1611 cmsg->cmsg_level = SOL_SOCKET;
1612 cmsg->cmsg_type = SCM_RIGHTS;
1613 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1614 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1615
1616 mh.msg_controllen = cmsg->cmsg_len;
1617
1618 /* Store away the fd in the socket, so that it stays open as
1619 * long as we run the child */
1620 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1621 if (k < 0)
1622 return log_error_errno(errno, "Failed to send netlink fd: %m");
1623
1624 return 0;
1625 }
1626
1627 static int flush_ports(union in_addr_union *exposed) {
1628 ExposePort *p;
1629 int r, af = AF_INET;
1630
1631 assert(exposed);
1632
1633 if (!arg_expose_ports)
1634 return 0;
1635
1636 if (in_addr_is_null(af, exposed))
1637 return 0;
1638
1639 log_debug("Lost IP address.");
1640
1641 LIST_FOREACH(ports, p, arg_expose_ports) {
1642 r = fw_add_local_dnat(false,
1643 af,
1644 p->protocol,
1645 NULL,
1646 NULL, 0,
1647 NULL, 0,
1648 p->host_port,
1649 exposed,
1650 p->container_port,
1651 NULL);
1652 if (r < 0)
1653 log_warning_errno(r, "Failed to modify firewall: %m");
1654 }
1655
1656 *exposed = IN_ADDR_NULL;
1657 return 0;
1658 }
1659
1660 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1661 _cleanup_free_ struct local_address *addresses = NULL;
1662 _cleanup_free_ char *pretty = NULL;
1663 union in_addr_union new_exposed;
1664 ExposePort *p;
1665 bool add;
1666 int af = AF_INET, r;
1667
1668 assert(exposed);
1669
1670 /* Invoked each time an address is added or removed inside the
1671 * container */
1672
1673 if (!arg_expose_ports)
1674 return 0;
1675
1676 r = local_addresses(rtnl, 0, af, &addresses);
1677 if (r < 0)
1678 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1679
1680 add = r > 0 &&
1681 addresses[0].family == af &&
1682 addresses[0].scope < RT_SCOPE_LINK;
1683
1684 if (!add)
1685 return flush_ports(exposed);
1686
1687 new_exposed = addresses[0].address;
1688 if (in_addr_equal(af, exposed, &new_exposed))
1689 return 0;
1690
1691 in_addr_to_string(af, &new_exposed, &pretty);
1692 log_debug("New container IP is %s.", strna(pretty));
1693
1694 LIST_FOREACH(ports, p, arg_expose_ports) {
1695
1696 r = fw_add_local_dnat(true,
1697 af,
1698 p->protocol,
1699 NULL,
1700 NULL, 0,
1701 NULL, 0,
1702 p->host_port,
1703 &new_exposed,
1704 p->container_port,
1705 in_addr_is_null(af, exposed) ? NULL : exposed);
1706 if (r < 0)
1707 log_warning_errno(r, "Failed to modify firewall: %m");
1708 }
1709
1710 *exposed = new_exposed;
1711 return 0;
1712 }
1713
1714 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1715 union in_addr_union *exposed = userdata;
1716
1717 assert(rtnl);
1718 assert(m);
1719 assert(exposed);
1720
1721 expose_ports(rtnl, exposed);
1722 return 0;
1723 }
1724
1725 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1726 union {
1727 struct cmsghdr cmsghdr;
1728 uint8_t buf[CMSG_SPACE(sizeof(int))];
1729 } control = {};
1730 struct msghdr mh = {
1731 .msg_control = &control,
1732 .msg_controllen = sizeof(control),
1733 };
1734 struct cmsghdr *cmsg;
1735 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1736 int fd, r;
1737 ssize_t k;
1738
1739 assert(event);
1740 assert(recv_fd >= 0);
1741 assert(ret);
1742
1743 if (!arg_expose_ports)
1744 return 0;
1745
1746 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1747 if (k < 0)
1748 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1749
1750 cmsg = CMSG_FIRSTHDR(&mh);
1751 assert(cmsg->cmsg_level == SOL_SOCKET);
1752 assert(cmsg->cmsg_type == SCM_RIGHTS);
1753 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1754 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1755
1756 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1757 if (r < 0) {
1758 safe_close(fd);
1759 return log_error_errno(r, "Failed to create rtnl object: %m");
1760 }
1761
1762 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1763 if (r < 0)
1764 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1765
1766 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1767 if (r < 0)
1768 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1769
1770 r = sd_rtnl_attach_event(rtnl, event, 0);
1771 if (r < 0)
1772 return log_error_errno(r, "Failed to add to even loop: %m");
1773
1774 *ret = rtnl;
1775 rtnl = NULL;
1776
1777 return 0;
1778 }
1779
1780 static int setup_hostname(void) {
1781
1782 if (arg_share_system)
1783 return 0;
1784
1785 if (sethostname_idempotent(arg_machine) < 0)
1786 return -errno;
1787
1788 return 0;
1789 }
1790
1791 static int setup_journal(const char *directory) {
1792 sd_id128_t machine_id, this_id;
1793 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1794 char *id;
1795 int r;
1796
1797 /* Don't link journals in ephemeral mode */
1798 if (arg_ephemeral)
1799 return 0;
1800
1801 p = strappend(directory, "/etc/machine-id");
1802 if (!p)
1803 return log_oom();
1804
1805 r = read_one_line_file(p, &b);
1806 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1807 return 0;
1808 else if (r < 0)
1809 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1810
1811 id = strstrip(b);
1812 if (isempty(id) && arg_link_journal == LINK_AUTO)
1813 return 0;
1814
1815 /* Verify validity */
1816 r = sd_id128_from_string(id, &machine_id);
1817 if (r < 0)
1818 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1819
1820 r = sd_id128_get_machine(&this_id);
1821 if (r < 0)
1822 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1823
1824 if (sd_id128_equal(machine_id, this_id)) {
1825 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1826 "Host and machine ids are equal (%s): refusing to link journals", id);
1827 if (arg_link_journal == LINK_AUTO)
1828 return 0;
1829 return -EEXIST;
1830 }
1831
1832 if (arg_link_journal == LINK_NO)
1833 return 0;
1834
1835 free(p);
1836 p = strappend("/var/log/journal/", id);
1837 q = strjoin(directory, "/var/log/journal/", id, NULL);
1838 if (!p || !q)
1839 return log_oom();
1840
1841 if (path_is_mount_point(p, false) > 0) {
1842 if (arg_link_journal != LINK_AUTO) {
1843 log_error("%s: already a mount point, refusing to use for journal", p);
1844 return -EEXIST;
1845 }
1846
1847 return 0;
1848 }
1849
1850 if (path_is_mount_point(q, false) > 0) {
1851 if (arg_link_journal != LINK_AUTO) {
1852 log_error("%s: already a mount point, refusing to use for journal", q);
1853 return -EEXIST;
1854 }
1855
1856 return 0;
1857 }
1858
1859 r = readlink_and_make_absolute(p, &d);
1860 if (r >= 0) {
1861 if ((arg_link_journal == LINK_GUEST ||
1862 arg_link_journal == LINK_AUTO) &&
1863 path_equal(d, q)) {
1864
1865 r = mkdir_p(q, 0755);
1866 if (r < 0)
1867 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1868 return 0;
1869 }
1870
1871 if (unlink(p) < 0)
1872 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1873 } else if (r == -EINVAL) {
1874
1875 if (arg_link_journal == LINK_GUEST &&
1876 rmdir(p) < 0) {
1877
1878 if (errno == ENOTDIR) {
1879 log_error("%s already exists and is neither a symlink nor a directory", p);
1880 return r;
1881 } else {
1882 log_error_errno(errno, "Failed to remove %s: %m", p);
1883 return -errno;
1884 }
1885 }
1886 } else if (r != -ENOENT) {
1887 log_error_errno(errno, "readlink(%s) failed: %m", p);
1888 return r;
1889 }
1890
1891 if (arg_link_journal == LINK_GUEST) {
1892
1893 if (symlink(q, p) < 0) {
1894 if (arg_link_journal_try) {
1895 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1896 return 0;
1897 } else {
1898 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1899 return -errno;
1900 }
1901 }
1902
1903 r = mkdir_p(q, 0755);
1904 if (r < 0)
1905 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1906 return 0;
1907 }
1908
1909 if (arg_link_journal == LINK_HOST) {
1910 /* don't create parents here -- if the host doesn't have
1911 * permanent journal set up, don't force it here */
1912 r = mkdir(p, 0755);
1913 if (r < 0) {
1914 if (arg_link_journal_try) {
1915 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1916 return 0;
1917 } else {
1918 log_error_errno(errno, "Failed to create %s: %m", p);
1919 return r;
1920 }
1921 }
1922
1923 } else if (access(p, F_OK) < 0)
1924 return 0;
1925
1926 if (dir_is_empty(q) == 0)
1927 log_warning("%s is not empty, proceeding anyway.", q);
1928
1929 r = mkdir_p(q, 0755);
1930 if (r < 0) {
1931 log_error_errno(errno, "Failed to create %s: %m", q);
1932 return r;
1933 }
1934
1935 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1936 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1937
1938 return 0;
1939 }
1940
1941 static int drop_capabilities(void) {
1942 return capability_bounding_set_drop(~arg_retain, false);
1943 }
1944
1945 static int register_machine(pid_t pid, int local_ifindex) {
1946 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1947 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1948 int r;
1949
1950 if (!arg_register)
1951 return 0;
1952
1953 r = sd_bus_default_system(&bus);
1954 if (r < 0)
1955 return log_error_errno(r, "Failed to open system bus: %m");
1956
1957 if (arg_keep_unit) {
1958 r = sd_bus_call_method(
1959 bus,
1960 "org.freedesktop.machine1",
1961 "/org/freedesktop/machine1",
1962 "org.freedesktop.machine1.Manager",
1963 "RegisterMachineWithNetwork",
1964 &error,
1965 NULL,
1966 "sayssusai",
1967 arg_machine,
1968 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1969 "nspawn",
1970 "container",
1971 (uint32_t) pid,
1972 strempty(arg_directory),
1973 local_ifindex > 0 ? 1 : 0, local_ifindex);
1974 } else {
1975 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1976 char **i;
1977
1978 r = sd_bus_message_new_method_call(
1979 bus,
1980 &m,
1981 "org.freedesktop.machine1",
1982 "/org/freedesktop/machine1",
1983 "org.freedesktop.machine1.Manager",
1984 "CreateMachineWithNetwork");
1985 if (r < 0)
1986 return bus_log_create_error(r);
1987
1988 r = sd_bus_message_append(
1989 m,
1990 "sayssusai",
1991 arg_machine,
1992 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1993 "nspawn",
1994 "container",
1995 (uint32_t) pid,
1996 strempty(arg_directory),
1997 local_ifindex > 0 ? 1 : 0, local_ifindex);
1998 if (r < 0)
1999 return bus_log_create_error(r);
2000
2001 r = sd_bus_message_open_container(m, 'a', "(sv)");
2002 if (r < 0)
2003 return bus_log_create_error(r);
2004
2005 if (!isempty(arg_slice)) {
2006 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2007 if (r < 0)
2008 return bus_log_create_error(r);
2009 }
2010
2011 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2012 if (r < 0)
2013 return bus_log_create_error(r);
2014
2015 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2016 /* Allow the container to
2017 * access and create the API
2018 * device nodes, so that
2019 * PrivateDevices= in the
2020 * container can work
2021 * fine */
2022 "/dev/null", "rwm",
2023 "/dev/zero", "rwm",
2024 "/dev/full", "rwm",
2025 "/dev/random", "rwm",
2026 "/dev/urandom", "rwm",
2027 "/dev/tty", "rwm",
2028 "/dev/net/tun", "rwm",
2029 /* Allow the container
2030 * access to ptys. However,
2031 * do not permit the
2032 * container to ever create
2033 * these device nodes. */
2034 "/dev/pts/ptmx", "rw",
2035 "char-pts", "rw");
2036 if (r < 0)
2037 return log_error_errno(r, "Failed to add device whitelist: %m");
2038
2039 STRV_FOREACH(i, arg_property) {
2040 r = sd_bus_message_open_container(m, 'r', "sv");
2041 if (r < 0)
2042 return bus_log_create_error(r);
2043
2044 r = bus_append_unit_property_assignment(m, *i);
2045 if (r < 0)
2046 return r;
2047
2048 r = sd_bus_message_close_container(m);
2049 if (r < 0)
2050 return bus_log_create_error(r);
2051 }
2052
2053 r = sd_bus_message_close_container(m);
2054 if (r < 0)
2055 return bus_log_create_error(r);
2056
2057 r = sd_bus_call(bus, m, 0, &error, NULL);
2058 }
2059
2060 if (r < 0) {
2061 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2062 return r;
2063 }
2064
2065 return 0;
2066 }
2067
2068 static int terminate_machine(pid_t pid) {
2069 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2070 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2071 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2072 const char *path;
2073 int r;
2074
2075 if (!arg_register)
2076 return 0;
2077
2078 r = sd_bus_default_system(&bus);
2079 if (r < 0)
2080 return log_error_errno(r, "Failed to open system bus: %m");
2081
2082 r = sd_bus_call_method(
2083 bus,
2084 "org.freedesktop.machine1",
2085 "/org/freedesktop/machine1",
2086 "org.freedesktop.machine1.Manager",
2087 "GetMachineByPID",
2088 &error,
2089 &reply,
2090 "u",
2091 (uint32_t) pid);
2092 if (r < 0) {
2093 /* Note that the machine might already have been
2094 * cleaned up automatically, hence don't consider it a
2095 * failure if we cannot get the machine object. */
2096 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2097 return 0;
2098 }
2099
2100 r = sd_bus_message_read(reply, "o", &path);
2101 if (r < 0)
2102 return bus_log_parse_error(r);
2103
2104 r = sd_bus_call_method(
2105 bus,
2106 "org.freedesktop.machine1",
2107 path,
2108 "org.freedesktop.machine1.Machine",
2109 "Terminate",
2110 &error,
2111 NULL,
2112 NULL);
2113 if (r < 0) {
2114 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2115 return 0;
2116 }
2117
2118 return 0;
2119 }
2120
2121 static int reset_audit_loginuid(void) {
2122 _cleanup_free_ char *p = NULL;
2123 int r;
2124
2125 if (arg_share_system)
2126 return 0;
2127
2128 r = read_one_line_file("/proc/self/loginuid", &p);
2129 if (r == -ENOENT)
2130 return 0;
2131 if (r < 0)
2132 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2133
2134 /* Already reset? */
2135 if (streq(p, "4294967295"))
2136 return 0;
2137
2138 r = write_string_file("/proc/self/loginuid", "4294967295");
2139 if (r < 0) {
2140 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2141 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2142 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2143 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2144 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2145
2146 sleep(5);
2147 }
2148
2149 return 0;
2150 }
2151
2152 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2153 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2154 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2155
2156 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2157 uint8_t result[8];
2158 size_t l, sz;
2159 uint8_t *v, *i;
2160 int r;
2161
2162 l = strlen(arg_machine);
2163 sz = sizeof(sd_id128_t) + l;
2164 if (idx > 0)
2165 sz += sizeof(idx);
2166
2167 v = alloca(sz);
2168
2169 /* fetch some persistent data unique to the host */
2170 r = sd_id128_get_machine((sd_id128_t*) v);
2171 if (r < 0)
2172 return r;
2173
2174 /* combine with some data unique (on this host) to this
2175 * container instance */
2176 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2177 if (idx > 0) {
2178 idx = htole64(idx);
2179 memcpy(i, &idx, sizeof(idx));
2180 }
2181
2182 /* Let's hash the host machine ID plus the container name. We
2183 * use a fixed, but originally randomly created hash key here. */
2184 siphash24(result, v, sz, hash_key.bytes);
2185
2186 assert_cc(ETH_ALEN <= sizeof(result));
2187 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2188
2189 /* see eth_random_addr in the kernel */
2190 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2191 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2192
2193 return 0;
2194 }
2195
2196 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2197 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2198 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2199 struct ether_addr mac_host, mac_container;
2200 int r, i;
2201
2202 if (!arg_private_network)
2203 return 0;
2204
2205 if (!arg_network_veth)
2206 return 0;
2207
2208 /* Use two different interface name prefixes depending whether
2209 * we are in bridge mode or not. */
2210 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2211 arg_network_bridge ? "vb" : "ve", arg_machine);
2212
2213 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2214 if (r < 0)
2215 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2216
2217 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2218 if (r < 0)
2219 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2220
2221 r = sd_rtnl_open(&rtnl, 0);
2222 if (r < 0)
2223 return log_error_errno(r, "Failed to connect to netlink: %m");
2224
2225 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2226 if (r < 0)
2227 return log_error_errno(r, "Failed to allocate netlink message: %m");
2228
2229 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2230 if (r < 0)
2231 return log_error_errno(r, "Failed to add netlink interface name: %m");
2232
2233 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2234 if (r < 0)
2235 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2236
2237 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2238 if (r < 0)
2239 return log_error_errno(r, "Failed to open netlink container: %m");
2240
2241 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2242 if (r < 0)
2243 return log_error_errno(r, "Failed to open netlink container: %m");
2244
2245 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2246 if (r < 0)
2247 return log_error_errno(r, "Failed to open netlink container: %m");
2248
2249 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2250 if (r < 0)
2251 return log_error_errno(r, "Failed to add netlink interface name: %m");
2252
2253 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2254 if (r < 0)
2255 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2256
2257 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2258 if (r < 0)
2259 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2260
2261 r = sd_rtnl_message_close_container(m);
2262 if (r < 0)
2263 return log_error_errno(r, "Failed to close netlink container: %m");
2264
2265 r = sd_rtnl_message_close_container(m);
2266 if (r < 0)
2267 return log_error_errno(r, "Failed to close netlink container: %m");
2268
2269 r = sd_rtnl_message_close_container(m);
2270 if (r < 0)
2271 return log_error_errno(r, "Failed to close netlink container: %m");
2272
2273 r = sd_rtnl_call(rtnl, m, 0, NULL);
2274 if (r < 0)
2275 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2276
2277 i = (int) if_nametoindex(iface_name);
2278 if (i <= 0)
2279 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2280
2281 *ifi = i;
2282
2283 return 0;
2284 }
2285
2286 static int setup_bridge(const char veth_name[], int *ifi) {
2287 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2288 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2289 int r, bridge;
2290
2291 if (!arg_private_network)
2292 return 0;
2293
2294 if (!arg_network_veth)
2295 return 0;
2296
2297 if (!arg_network_bridge)
2298 return 0;
2299
2300 bridge = (int) if_nametoindex(arg_network_bridge);
2301 if (bridge <= 0)
2302 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2303
2304 *ifi = bridge;
2305
2306 r = sd_rtnl_open(&rtnl, 0);
2307 if (r < 0)
2308 return log_error_errno(r, "Failed to connect to netlink: %m");
2309
2310 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2311 if (r < 0)
2312 return log_error_errno(r, "Failed to allocate netlink message: %m");
2313
2314 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2315 if (r < 0)
2316 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2317
2318 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2319 if (r < 0)
2320 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2321
2322 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2323 if (r < 0)
2324 return log_error_errno(r, "Failed to add netlink master field: %m");
2325
2326 r = sd_rtnl_call(rtnl, m, 0, NULL);
2327 if (r < 0)
2328 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2329
2330 return 0;
2331 }
2332
2333 static int parse_interface(struct udev *udev, const char *name) {
2334 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2335 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2336 int ifi;
2337
2338 ifi = (int) if_nametoindex(name);
2339 if (ifi <= 0)
2340 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2341
2342 sprintf(ifi_str, "n%i", ifi);
2343 d = udev_device_new_from_device_id(udev, ifi_str);
2344 if (!d)
2345 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2346
2347 if (udev_device_get_is_initialized(d) <= 0) {
2348 log_error("Network interface %s is not initialized yet.", name);
2349 return -EBUSY;
2350 }
2351
2352 return ifi;
2353 }
2354
2355 static int move_network_interfaces(pid_t pid) {
2356 _cleanup_udev_unref_ struct udev *udev = NULL;
2357 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2358 char **i;
2359 int r;
2360
2361 if (!arg_private_network)
2362 return 0;
2363
2364 if (strv_isempty(arg_network_interfaces))
2365 return 0;
2366
2367 r = sd_rtnl_open(&rtnl, 0);
2368 if (r < 0)
2369 return log_error_errno(r, "Failed to connect to netlink: %m");
2370
2371 udev = udev_new();
2372 if (!udev) {
2373 log_error("Failed to connect to udev.");
2374 return -ENOMEM;
2375 }
2376
2377 STRV_FOREACH(i, arg_network_interfaces) {
2378 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2379 int ifi;
2380
2381 ifi = parse_interface(udev, *i);
2382 if (ifi < 0)
2383 return ifi;
2384
2385 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2386 if (r < 0)
2387 return log_error_errno(r, "Failed to allocate netlink message: %m");
2388
2389 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2390 if (r < 0)
2391 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2392
2393 r = sd_rtnl_call(rtnl, m, 0, NULL);
2394 if (r < 0)
2395 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2396 }
2397
2398 return 0;
2399 }
2400
2401 static int setup_macvlan(pid_t pid) {
2402 _cleanup_udev_unref_ struct udev *udev = NULL;
2403 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2404 unsigned idx = 0;
2405 char **i;
2406 int r;
2407
2408 if (!arg_private_network)
2409 return 0;
2410
2411 if (strv_isempty(arg_network_macvlan))
2412 return 0;
2413
2414 r = sd_rtnl_open(&rtnl, 0);
2415 if (r < 0)
2416 return log_error_errno(r, "Failed to connect to netlink: %m");
2417
2418 udev = udev_new();
2419 if (!udev) {
2420 log_error("Failed to connect to udev.");
2421 return -ENOMEM;
2422 }
2423
2424 STRV_FOREACH(i, arg_network_macvlan) {
2425 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2426 _cleanup_free_ char *n = NULL;
2427 struct ether_addr mac;
2428 int ifi;
2429
2430 ifi = parse_interface(udev, *i);
2431 if (ifi < 0)
2432 return ifi;
2433
2434 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2435 if (r < 0)
2436 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2437
2438 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2439 if (r < 0)
2440 return log_error_errno(r, "Failed to allocate netlink message: %m");
2441
2442 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2443 if (r < 0)
2444 return log_error_errno(r, "Failed to add netlink interface index: %m");
2445
2446 n = strappend("mv-", *i);
2447 if (!n)
2448 return log_oom();
2449
2450 strshorten(n, IFNAMSIZ-1);
2451
2452 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2453 if (r < 0)
2454 return log_error_errno(r, "Failed to add netlink interface name: %m");
2455
2456 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2457 if (r < 0)
2458 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2459
2460 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2461 if (r < 0)
2462 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2463
2464 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2465 if (r < 0)
2466 return log_error_errno(r, "Failed to open netlink container: %m");
2467
2468 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2469 if (r < 0)
2470 return log_error_errno(r, "Failed to open netlink container: %m");
2471
2472 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2473 if (r < 0)
2474 return log_error_errno(r, "Failed to append macvlan mode: %m");
2475
2476 r = sd_rtnl_message_close_container(m);
2477 if (r < 0)
2478 return log_error_errno(r, "Failed to close netlink container: %m");
2479
2480 r = sd_rtnl_message_close_container(m);
2481 if (r < 0)
2482 return log_error_errno(r, "Failed to close netlink container: %m");
2483
2484 r = sd_rtnl_call(rtnl, m, 0, NULL);
2485 if (r < 0)
2486 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2487 }
2488
2489 return 0;
2490 }
2491
2492 static int setup_ipvlan(pid_t pid) {
2493 _cleanup_udev_unref_ struct udev *udev = NULL;
2494 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2495 char **i;
2496 int r;
2497
2498 if (!arg_private_network)
2499 return 0;
2500
2501 if (strv_isempty(arg_network_ipvlan))
2502 return 0;
2503
2504 r = sd_rtnl_open(&rtnl, 0);
2505 if (r < 0)
2506 return log_error_errno(r, "Failed to connect to netlink: %m");
2507
2508 udev = udev_new();
2509 if (!udev) {
2510 log_error("Failed to connect to udev.");
2511 return -ENOMEM;
2512 }
2513
2514 STRV_FOREACH(i, arg_network_ipvlan) {
2515 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2516 _cleanup_free_ char *n = NULL;
2517 int ifi;
2518
2519 ifi = parse_interface(udev, *i);
2520 if (ifi < 0)
2521 return ifi;
2522
2523 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2524 if (r < 0)
2525 return log_error_errno(r, "Failed to allocate netlink message: %m");
2526
2527 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2528 if (r < 0)
2529 return log_error_errno(r, "Failed to add netlink interface index: %m");
2530
2531 n = strappend("iv-", *i);
2532 if (!n)
2533 return log_oom();
2534
2535 strshorten(n, IFNAMSIZ-1);
2536
2537 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2538 if (r < 0)
2539 return log_error_errno(r, "Failed to add netlink interface name: %m");
2540
2541 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2542 if (r < 0)
2543 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2544
2545 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2546 if (r < 0)
2547 return log_error_errno(r, "Failed to open netlink container: %m");
2548
2549 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2550 if (r < 0)
2551 return log_error_errno(r, "Failed to open netlink container: %m");
2552
2553 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2554 if (r < 0)
2555 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2556
2557 r = sd_rtnl_message_close_container(m);
2558 if (r < 0)
2559 return log_error_errno(r, "Failed to close netlink container: %m");
2560
2561 r = sd_rtnl_message_close_container(m);
2562 if (r < 0)
2563 return log_error_errno(r, "Failed to close netlink container: %m");
2564
2565 r = sd_rtnl_call(rtnl, m, 0, NULL);
2566 if (r < 0)
2567 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2568 }
2569
2570 return 0;
2571 }
2572
/* Installs a seccomp filter that allows everything by default, except:
 *
 *  - the syscalls in the table below, which fail with EPERM unless the
 *    capability listed next to them is part of arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT), which fails with EAFNOSUPPORT.
 *
 * Returns 0 on success (or when built without seccomp support), a
 * negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const struct {
                uint64_t capability;
                int syscall_num;
        } guarded[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)},
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)},
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)},
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at)},
                { CAP_SYS_MODULE, SCMP_SYS(init_module)},
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int k;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        k = seccomp_add_secondary_archs(ctx);
        if (k < 0) {
                log_error_errno(k, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(guarded); n++) {

                /* A retained capability keeps its syscalls available. */
                if (arg_retain & (1ULL << guarded[n].capability))
                        continue;

                k = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), guarded[n].syscall_num, 0);
                if (k == -EFAULT)
                        continue; /* unknown syscall */
                if (k < 0) {
                        log_error_errno(k, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when running inside one. We don't care
         * and simply turn off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        k = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error_errno(k, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        k = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error_errno(k, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        k = seccomp_load(ctx);
        if (k < 0)
                log_error_errno(k, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return k;
#endif

}
2659
2660 static int setup_propagate(const char *root) {
2661 const char *p, *q;
2662
2663 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2664 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2665 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2666 (void) mkdir_p(p, 0600);
2667
2668 q = strjoina(root, "/run/systemd/nspawn/incoming");
2669 mkdir_parents(q, 0755);
2670 mkdir_p(q, 0600);
2671
2672 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2673 return log_error_errno(errno, "Failed to install propagation bind mount.");
2674
2675 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2676 return log_error_errno(errno, "Failed to make propagation mount read-only");
2677
2678 return 0;
2679 }
2680
2681 static int setup_image(char **device_path, int *loop_nr) {
2682 struct loop_info64 info = {
2683 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2684 };
2685 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2686 _cleanup_free_ char* loopdev = NULL;
2687 struct stat st;
2688 int r, nr;
2689
2690 assert(device_path);
2691 assert(loop_nr);
2692 assert(arg_image);
2693
2694 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2695 if (fd < 0)
2696 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2697
2698 if (fstat(fd, &st) < 0)
2699 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2700
2701 if (S_ISBLK(st.st_mode)) {
2702 char *p;
2703
2704 p = strdup(arg_image);
2705 if (!p)
2706 return log_oom();
2707
2708 *device_path = p;
2709
2710 *loop_nr = -1;
2711
2712 r = fd;
2713 fd = -1;
2714
2715 return r;
2716 }
2717
2718 if (!S_ISREG(st.st_mode)) {
2719 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2720 return -EINVAL;
2721 }
2722
2723 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2724 if (control < 0)
2725 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2726
2727 nr = ioctl(control, LOOP_CTL_GET_FREE);
2728 if (nr < 0)
2729 return log_error_errno(errno, "Failed to allocate loop device: %m");
2730
2731 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2732 return log_oom();
2733
2734 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2735 if (loop < 0)
2736 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2737
2738 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2739 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2740
2741 if (arg_read_only)
2742 info.lo_flags |= LO_FLAGS_READ_ONLY;
2743
2744 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2745 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2746
2747 *device_path = loopdev;
2748 loopdev = NULL;
2749
2750 *loop_nr = nr;
2751
2752 r = loop;
2753 loop = -1;
2754
2755 return r;
2756 }
2757
/* Explanation appended to the error messages of dissect_image() when an
 * image's partition table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Probes the partition table of the block device open as fd and picks
 * the partitions to use as root, /home and /srv.
 *
 * GPT: partitions are selected by type UUID, skipping those flagged
 * GPT_FLAG_NO_AUTO. For home, srv, native root and secondary root the
 * partition with the lowest partition number wins. If no dedicated root
 * partition exists, a single generic Linux partition is accepted.
 *
 * MBR: only partitions of type 0x83 whose flags equal 0x80 (bootable)
 * are considered, and exactly one of them must exist.
 *
 * On success returns 0 and:
 *   *root_device / *root_device_rw   device node of the root partition
 *                                    (always set) and whether it may be
 *                                    mounted writable,
 *   *home_device / *home_device_rw   same for /home, only written if found,
 *   *srv_device / *srv_device_rw     same for /srv, only written if found,
 *   *secondary                       true if the secondary-architecture
 *                                    root partition was chosen.
 * The returned strings are newly allocated and owned by the caller.
 *
 * On failure returns a negative errno-style value; -EOPNOTSUPP when
 * built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait (up to 10 rounds) until the kernel exposes as many
         * partition devices as blkid found in the partition table. On
         * leaving the loop via break, e holds the matching enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel
                 * count is compared against m + 1, i.e. the enumeration
                 * is expected to contain one entry more than there are
                 * partitions. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate (not as "root"), so that a
                                 * second bootable Linux partition is
                                 * noticed above and rejected below
                                 * instead of silently replacing this
                                 * one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3137
/* Mounts the block device what on where (or on where + directory if
 * directory is non-NULL), using the file system type detected by blkid.
 *
 * The mount is read-only if rw is false or arg_read_only is set, and
 * always uses MS_NODEV. LUKS volumes are refused with -EOPNOTSUPP.
 *
 * Returns 0 on success, a negative errno-style value otherwise;
 * -EOPNOTSUPP when built without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Both mean
                 * no usable file system type; a real probing error (-1)
                 * is reported with errno below. This is the same
                 * classification dissect_image() applies. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3201
3202 static int mount_devices(
3203 const char *where,
3204 const char *root_device, bool root_device_rw,
3205 const char *home_device, bool home_device_rw,
3206 const char *srv_device, bool srv_device_rw) {
3207 int r;
3208
3209 assert(where);
3210
3211 if (root_device) {
3212 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3213 if (r < 0)
3214 return log_error_errno(r, "Failed to mount root directory: %m");
3215 }
3216
3217 if (home_device) {
3218 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3219 if (r < 0)
3220 return log_error_errno(r, "Failed to mount home directory: %m");
3221 }
3222
3223 if (srv_device) {
3224 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3225 if (r < 0)
3226 return log_error_errno(r, "Failed to mount server data directory: %m");
3227 }
3228
3229 return 0;
3230 }
3231
3232 static void loop_remove(int nr, int *image_fd) {
3233 _cleanup_close_ int control = -1;
3234 int r;
3235
3236 if (nr < 0)
3237 return;
3238
3239 if (image_fd && *image_fd >= 0) {
3240 r = ioctl(*image_fd, LOOP_CLR_FD);
3241 if (r < 0)
3242 log_debug_errno(errno, "Failed to close loop image: %m");
3243 *image_fd = safe_close(*image_fd);
3244 }
3245
3246 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3247 if (control < 0) {
3248 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3249 return;
3250 }
3251
3252 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3253 if (r < 0)
3254 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3255 }
3256
3257 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3258 int pipe_fds[2];
3259 pid_t pid;
3260
3261 assert(database);
3262 assert(key);
3263 assert(rpid);
3264
3265 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3266 return log_error_errno(errno, "Failed to allocate pipe: %m");
3267
3268 pid = fork();
3269 if (pid < 0)
3270 return log_error_errno(errno, "Failed to fork getent child: %m");
3271 else if (pid == 0) {
3272 int nullfd;
3273 char *empty_env = NULL;
3274
3275 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3276 _exit(EXIT_FAILURE);
3277
3278 if (pipe_fds[0] > 2)
3279 safe_close(pipe_fds[0]);
3280 if (pipe_fds[1] > 2)
3281 safe_close(pipe_fds[1]);
3282
3283 nullfd = open("/dev/null", O_RDWR);
3284 if (nullfd < 0)
3285 _exit(EXIT_FAILURE);
3286
3287 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3288 _exit(EXIT_FAILURE);
3289
3290 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3291 _exit(EXIT_FAILURE);
3292
3293 if (nullfd > 2)
3294 safe_close(nullfd);
3295
3296 reset_all_signal_handlers();
3297 close_all_fds(NULL, 0);
3298
3299 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3300 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3301 _exit(EXIT_FAILURE);
3302 }
3303
3304 pipe_fds[1] = safe_close(pipe_fds[1]);
3305
3306 *rpid = pid;
3307
3308 return pipe_fds[0];
3309 }
3310
3311 static int change_uid_gid(char **_home) {
3312 char line[LINE_MAX], *x, *u, *g, *h;
3313 const char *word, *state;
3314 _cleanup_free_ uid_t *uids = NULL;
3315 _cleanup_free_ char *home = NULL;
3316 _cleanup_fclose_ FILE *f = NULL;
3317 _cleanup_close_ int fd = -1;
3318 unsigned n_uids = 0;
3319 size_t sz = 0, l;
3320 uid_t uid;
3321 gid_t gid;
3322 pid_t pid;
3323 int r;
3324
3325 assert(_home);
3326
3327 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3328 /* Reset everything fully to 0, just in case */
3329
3330 if (setgroups(0, NULL) < 0)
3331 return log_error_errno(errno, "setgroups() failed: %m");
3332
3333 if (setresgid(0, 0, 0) < 0)
3334 return log_error_errno(errno, "setregid() failed: %m");
3335
3336 if (setresuid(0, 0, 0) < 0)
3337 return log_error_errno(errno, "setreuid() failed: %m");
3338
3339 *_home = NULL;
3340 return 0;
3341 }
3342
3343 /* First, get user credentials */
3344 fd = spawn_getent("passwd", arg_user, &pid);
3345 if (fd < 0)
3346 return fd;
3347
3348 f = fdopen(fd, "r");
3349 if (!f)
3350 return log_oom();
3351 fd = -1;
3352
3353 if (!fgets(line, sizeof(line), f)) {
3354
3355 if (!ferror(f)) {
3356 log_error("Failed to resolve user %s.", arg_user);
3357 return -ESRCH;
3358 }
3359
3360 log_error_errno(errno, "Failed to read from getent: %m");
3361 return -errno;
3362 }
3363
3364 truncate_nl(line);
3365
3366 wait_for_terminate_and_warn("getent passwd", pid, true);
3367
3368 x = strchr(line, ':');
3369 if (!x) {
3370 log_error("/etc/passwd entry has invalid user field.");
3371 return -EIO;
3372 }
3373
3374 u = strchr(x+1, ':');
3375 if (!u) {
3376 log_error("/etc/passwd entry has invalid password field.");
3377 return -EIO;
3378 }
3379
3380 u++;
3381 g = strchr(u, ':');
3382 if (!g) {
3383 log_error("/etc/passwd entry has invalid UID field.");
3384 return -EIO;
3385 }
3386
3387 *g = 0;
3388 g++;
3389 x = strchr(g, ':');
3390 if (!x) {
3391 log_error("/etc/passwd entry has invalid GID field.");
3392 return -EIO;
3393 }
3394
3395 *x = 0;
3396 h = strchr(x+1, ':');
3397 if (!h) {
3398 log_error("/etc/passwd entry has invalid GECOS field.");
3399 return -EIO;
3400 }
3401
3402 h++;
3403 x = strchr(h, ':');
3404 if (!x) {
3405 log_error("/etc/passwd entry has invalid home directory field.");
3406 return -EIO;
3407 }
3408
3409 *x = 0;
3410
3411 r = parse_uid(u, &uid);
3412 if (r < 0) {
3413 log_error("Failed to parse UID of user.");
3414 return -EIO;
3415 }
3416
3417 r = parse_gid(g, &gid);
3418 if (r < 0) {
3419 log_error("Failed to parse GID of user.");
3420 return -EIO;
3421 }
3422
3423 home = strdup(h);
3424 if (!home)
3425 return log_oom();
3426
3427 /* Second, get group memberships */
3428 fd = spawn_getent("initgroups", arg_user, &pid);
3429 if (fd < 0)
3430 return fd;
3431
3432 fclose(f);
3433 f = fdopen(fd, "r");
3434 if (!f)
3435 return log_oom();
3436 fd = -1;
3437
3438 if (!fgets(line, sizeof(line), f)) {
3439 if (!ferror(f)) {
3440 log_error("Failed to resolve user %s.", arg_user);
3441 return -ESRCH;
3442 }
3443
3444 log_error_errno(errno, "Failed to read from getent: %m");
3445 return -errno;
3446 }
3447
3448 truncate_nl(line);
3449
3450 wait_for_terminate_and_warn("getent initgroups", pid, true);
3451
3452 /* Skip over the username and subsequent separator whitespace */
3453 x = line;
3454 x += strcspn(x, WHITESPACE);
3455 x += strspn(x, WHITESPACE);
3456
3457 FOREACH_WORD(word, l, x, state) {
3458 char c[l+1];
3459
3460 memcpy(c, word, l);
3461 c[l] = 0;
3462
3463 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3464 return log_oom();
3465
3466 r = parse_uid(c, &uids[n_uids++]);
3467 if (r < 0) {
3468 log_error("Failed to parse group data from getent.");
3469 return -EIO;
3470 }
3471 }
3472
3473 r = mkdir_parents(home, 0775);
3474 if (r < 0)
3475 return log_error_errno(r, "Failed to make home root directory: %m");
3476
3477 r = mkdir_safe(home, 0755, uid, gid);
3478 if (r < 0 && r != -EEXIST)
3479 return log_error_errno(r, "Failed to make home directory: %m");
3480
3481 fchown(STDIN_FILENO, uid, gid);
3482 fchown(STDOUT_FILENO, uid, gid);
3483 fchown(STDERR_FILENO, uid, gid);
3484
3485 if (setgroups(n_uids, uids) < 0)
3486 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3487
3488 if (setresgid(gid, gid, gid) < 0)
3489 return log_error_errno(errno, "setregid() failed: %m");
3490
3491 if (setresuid(uid, uid, uid) < 0)
3492 return log_error_errno(errno, "setreuid() failed: %m");
3493
3494 if (_home) {
3495 *_home = home;
3496 home = NULL;
3497 }
3498
3499 return 0;
3500 }
3501
3502 /*
3503 * Return values:
3504 * < 0 : wait_for_terminate() failed to get the state of the
3505 * container, the container was terminated by a signal, or
3506 * failed for an unknown reason. No change is made to the
3507 * container argument.
3508 * > 0 : The program executed in the container terminated with an
3509 * error. The exit code of the program executed in the
3510 * container is returned. The container argument has been set
3511 * to CONTAINER_TERMINATED.
3512 * 0 : The container is being rebooted, has been shut down or exited
3513 * successfully. The container argument has been set to either
3514 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3515 *
3516 * That is, success is indicated by a return value of zero, and an
3517 * error is indicated by a non-zero value.
3518 */
3519 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3520 siginfo_t status;
3521 int r;
3522
3523 r = wait_for_terminate(pid, &status);
3524 if (r < 0)
3525 return log_warning_errno(r, "Failed to wait for container: %m");
3526
3527 switch (status.si_code) {
3528
3529 case CLD_EXITED:
3530 if (status.si_status == 0) {
3531 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3532
3533 } else
3534 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3535
3536 *container = CONTAINER_TERMINATED;
3537 return status.si_status;
3538
3539 case CLD_KILLED:
3540 if (status.si_status == SIGINT) {
3541
3542 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3543 *container = CONTAINER_TERMINATED;
3544 return 0;
3545
3546 } else if (status.si_status == SIGHUP) {
3547
3548 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3549 *container = CONTAINER_REBOOTED;
3550 return 0;
3551 }
3552
3553 /* CLD_KILLED fallthrough */
3554
3555 case CLD_DUMPED:
3556 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3557 return -EIO;
3558
3559 default:
3560 log_error("Container %s failed due to unknown reason.", arg_machine);
3561 return -EIO;
3562 }
3563
3564 return r;
3565 }
3566
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal is handled rather than ignored, which lets
 * it interrupt the parent's blocking calls. */
static void nop_handler(int sig) {
        (void) sig;
}
3568
3569 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3570 pid_t pid;
3571
3572 pid = PTR_TO_UINT32(userdata);
3573 if (pid > 0) {
3574 if (kill(pid, arg_kill_signal) >= 0) {
3575 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3576 sd_event_source_set_userdata(s, NULL);
3577 return 0;
3578 }
3579 }
3580
3581 sd_event_exit(sd_event_source_get_event(s), 0);
3582 return 0;
3583 }
3584
3585 static int determine_names(void) {
3586 int r;
3587
3588 if (!arg_image && !arg_directory) {
3589 if (arg_machine) {
3590 _cleanup_(image_unrefp) Image *i = NULL;
3591
3592 r = image_find(arg_machine, &i);
3593 if (r < 0)
3594 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3595 else if (r == 0) {
3596 log_error("No image for machine '%s': %m", arg_machine);
3597 return -ENOENT;
3598 }
3599
3600 if (i->type == IMAGE_RAW)
3601 r = set_sanitized_path(&arg_image, i->path);
3602 else
3603 r = set_sanitized_path(&arg_directory, i->path);
3604 if (r < 0)
3605 return log_error_errno(r, "Invalid image directory: %m");
3606
3607 arg_read_only = arg_read_only || i->read_only;
3608 } else
3609 arg_directory = get_current_dir_name();
3610
3611 if (!arg_directory && !arg_machine) {
3612 log_error("Failed to determine path, please use -D or -i.");
3613 return -EINVAL;
3614 }
3615 }
3616
3617 if (!arg_machine) {
3618 if (arg_directory && path_equal(arg_directory, "/"))
3619 arg_machine = gethostname_malloc();
3620 else
3621 arg_machine = strdup(basename(arg_image ?: arg_directory));
3622
3623 if (!arg_machine)
3624 return log_oom();
3625
3626 hostname_cleanup(arg_machine, false);
3627 if (!machine_name_is_valid(arg_machine)) {
3628 log_error("Failed to determine machine name automatically, please use -M.");
3629 return -EINVAL;
3630 }
3631
3632 if (arg_ephemeral) {
3633 char *b;
3634
3635 /* Add a random suffix when this is an
3636 * ephemeral machine, so that we can run many
3637 * instances at once without manually having
3638 * to specify -M each time. */
3639
3640 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3641 return log_oom();
3642
3643 free(arg_machine);
3644 arg_machine = b;
3645 }
3646 }
3647
3648 return 0;
3649 }
3650
3651 static int determine_uid_shift(void) {
3652 int r;
3653
3654 if (!arg_userns)
3655 return 0;
3656
3657 if (arg_uid_shift == UID_INVALID) {
3658 struct stat st;
3659
3660 r = stat(arg_directory, &st);
3661 if (r < 0)
3662 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3663
3664 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3665
3666 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3667 log_error("UID and GID base of %s don't match.", arg_directory);
3668 return -EINVAL;
3669 }
3670
3671 arg_uid_range = UINT32_C(0x10000);
3672 }
3673
3674 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3675 log_error("UID base too high for UID range.");
3676 return -EINVAL;
3677 }
3678
3679 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3680 return 0;
3681 }
3682
3683 int main(int argc, char *argv[]) {
3684
3685 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3686 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3687 _cleanup_close_ int master = -1, image_fd = -1;
3688 _cleanup_fdset_free_ FDSet *fds = NULL;
3689 int r, n_fd_passed, loop_nr = -1;
3690 char veth_name[IFNAMSIZ];
3691 bool secondary = false, remove_subvol = false;
3692 sigset_t mask, mask_chld;
3693 pid_t pid = 0;
3694 int ret = EXIT_SUCCESS;
3695 union in_addr_union exposed = {};
3696 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3697 bool interactive;
3698
3699 log_parse_environment();
3700 log_open();
3701
3702 r = parse_argv(argc, argv);
3703 if (r <= 0)
3704 goto finish;
3705
3706 r = determine_names();
3707 if (r < 0)
3708 goto finish;
3709
3710 if (geteuid() != 0) {
3711 log_error("Need to be root.");
3712 r = -EPERM;
3713 goto finish;
3714 }
3715
3716 log_close();
3717 n_fd_passed = sd_listen_fds(false);
3718 if (n_fd_passed > 0) {
3719 r = fdset_new_listen_fds(&fds, false);
3720 if (r < 0) {
3721 log_error_errno(r, "Failed to collect file descriptors: %m");
3722 goto finish;
3723 }
3724 }
3725 fdset_close_others(fds);
3726 log_open();
3727
3728 if (arg_directory) {
3729 assert(!arg_image);
3730
3731 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3732 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3733 r = -EINVAL;
3734 goto finish;
3735 }
3736
3737 if (arg_ephemeral) {
3738 _cleanup_free_ char *np = NULL;
3739
3740 /* If the specified path is a mount point we
3741 * generate the new snapshot immediately
3742 * inside it under a random name. However if
3743 * the specified is not a mount point we
3744 * create the new snapshot in the parent
3745 * directory, just next to it. */
3746 r = path_is_mount_point(arg_directory, false);
3747 if (r < 0) {
3748 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3749 goto finish;
3750 }
3751 if (r > 0)
3752 r = tempfn_random_child(arg_directory, &np);
3753 else
3754 r = tempfn_random(arg_directory, &np);
3755 if (r < 0) {
3756 log_error_errno(r, "Failed to generate name for snapshot: %m");
3757 goto finish;
3758 }
3759
3760 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3761 if (r < 0) {
3762 log_error_errno(r, "Failed to lock %s: %m", np);
3763 goto finish;
3764 }
3765
3766 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3767 if (r < 0) {
3768 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3769 goto finish;
3770 }
3771
3772 free(arg_directory);
3773 arg_directory = np;
3774 np = NULL;
3775
3776 remove_subvol = true;
3777
3778 } else {
3779 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3780 if (r == -EBUSY) {
3781 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3782 goto finish;
3783 }
3784 if (r < 0) {
3785 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3786 return r;
3787 }
3788
3789 if (arg_template) {
3790 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3791 if (r == -EEXIST) {
3792 if (!arg_quiet)
3793 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3794 } else if (r < 0) {
3795 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3796 goto finish;
3797 } else {
3798 if (!arg_quiet)
3799 log_info("Populated %s from template %s.", arg_directory, arg_template);
3800 }
3801 }
3802 }
3803
3804 if (arg_boot) {
3805 if (path_is_os_tree(arg_directory) <= 0) {
3806 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3807 r = -EINVAL;
3808 goto finish;
3809 }
3810 } else {
3811 const char *p;
3812
3813 p = strjoina(arg_directory,
3814 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3815 if (access(p, F_OK) < 0) {
3816 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3817 r = -EINVAL;
3818 goto finish;
3819 }
3820 }
3821
3822 } else {
3823 char template[] = "/tmp/nspawn-root-XXXXXX";
3824
3825 assert(arg_image);
3826 assert(!arg_template);
3827
3828 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3829 if (r == -EBUSY) {
3830 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3831 goto finish;
3832 }
3833 if (r < 0) {
3834 r = log_error_errno(r, "Failed to create image lock: %m");
3835 goto finish;
3836 }
3837
3838 if (!mkdtemp(template)) {
3839 log_error_errno(errno, "Failed to create temporary directory: %m");
3840 r = -errno;
3841 goto finish;
3842 }
3843
3844 arg_directory = strdup(template);
3845 if (!arg_directory) {
3846 r = log_oom();
3847 goto finish;
3848 }
3849
3850 image_fd = setup_image(&device_path, &loop_nr);
3851 if (image_fd < 0) {
3852 r = image_fd;
3853 goto finish;
3854 }
3855
3856 r = dissect_image(image_fd,
3857 &root_device, &root_device_rw,
3858 &home_device, &home_device_rw,
3859 &srv_device, &srv_device_rw,
3860 &secondary);
3861 if (r < 0)
3862 goto finish;
3863 }
3864
3865 r = determine_uid_shift();
3866 if (r < 0)
3867 goto finish;
3868
3869 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3870
3871 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3872 if (master < 0) {
3873 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3874 goto finish;
3875 }
3876
3877 r = ptsname_malloc(master, &console);
3878 if (r < 0) {
3879 r = log_error_errno(r, "Failed to determine tty name: %m");
3880 goto finish;
3881 }
3882
3883 if (unlockpt(master) < 0) {
3884 r = log_error_errno(errno, "Failed to unlock tty: %m");
3885 goto finish;
3886 }
3887
3888 if (!arg_quiet)
3889 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3890 arg_machine, arg_image ?: arg_directory);
3891
3892 assert_se(sigemptyset(&mask) == 0);
3893 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3894 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3895
3896 assert_se(sigemptyset(&mask_chld) == 0);
3897 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3898
3899 for (;;) {
3900 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3901 ContainerStatus container_status;
3902 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3903 struct sigaction sa = {
3904 .sa_handler = nop_handler,
3905 .sa_flags = SA_NOCLDSTOP,
3906 };
3907
3908 r = barrier_create(&barrier);
3909 if (r < 0) {
3910 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3911 goto finish;
3912 }
3913
3914 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3915 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3916 goto finish;
3917 }
3918
3919 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3920 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3921 goto finish;
3922 }
3923
3924 /* Child can be killed before execv(), so handle SIGCHLD
3925 * in order to interrupt parent's blocking calls and
3926 * give it a chance to call wait() and terminate. */
3927 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3928 if (r < 0) {
3929 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3930 goto finish;
3931 }
3932
3933 r = sigaction(SIGCHLD, &sa, NULL);
3934 if (r < 0) {
3935 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3936 goto finish;
3937 }
3938
3939 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3940 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3941 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3942 if (pid < 0) {
3943 if (errno == EINVAL)
3944 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3945 else
3946 r = log_error_errno(errno, "clone() failed: %m");
3947
3948 goto finish;
3949 }
3950
3951 if (pid == 0) {
3952 /* child */
3953 _cleanup_free_ char *home = NULL;
3954 unsigned n_env = 2;
3955 const char *envp[] = {
3956 "PATH=" DEFAULT_PATH_SPLIT_USR,
3957 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3958 NULL, /* TERM */
3959 NULL, /* HOME */
3960 NULL, /* USER */
3961 NULL, /* LOGNAME */
3962 NULL, /* container_uuid */
3963 NULL, /* LISTEN_FDS */
3964 NULL, /* LISTEN_PID */
3965 NULL
3966 };
3967 char **env_use;
3968
3969 barrier_set_role(&barrier, BARRIER_CHILD);
3970
3971 envp[n_env] = strv_find_prefix(environ, "TERM=");
3972 if (envp[n_env])
3973 n_env ++;
3974
3975 master = safe_close(master);
3976
3977 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3978 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3979
3980 reset_all_signal_handlers();
3981 reset_signal_mask();
3982
3983 if (interactive) {
3984 close_nointr(STDIN_FILENO);
3985 close_nointr(STDOUT_FILENO);
3986 close_nointr(STDERR_FILENO);
3987
3988 r = open_terminal(console, O_RDWR);
3989 if (r != STDIN_FILENO) {
3990 if (r >= 0) {
3991 safe_close(r);
3992 r = -EINVAL;
3993 }
3994
3995 log_error_errno(r, "Failed to open console: %m");
3996 _exit(EXIT_FAILURE);
3997 }
3998
3999 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4000 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4001 log_error_errno(errno, "Failed to duplicate console: %m");
4002 _exit(EXIT_FAILURE);
4003 }
4004 }
4005
4006 if (setsid() < 0) {
4007 log_error_errno(errno, "setsid() failed: %m");
4008 _exit(EXIT_FAILURE);
4009 }
4010
4011 if (reset_audit_loginuid() < 0)
4012 _exit(EXIT_FAILURE);
4013
4014 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4015 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4016 _exit(EXIT_FAILURE);
4017 }
4018
4019 if (arg_private_network)
4020 loopback_setup();
4021
4022 /* Mark everything as slave, so that we still
4023 * receive mounts from the real root, but don't
4024 * propagate mounts to the real root. */
4025 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4026 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4027 _exit(EXIT_FAILURE);
4028 }
4029
4030 if (mount_devices(arg_directory,
4031 root_device, root_device_rw,
4032 home_device, home_device_rw,
4033 srv_device, srv_device_rw) < 0)
4034 _exit(EXIT_FAILURE);
4035
4036 /* Turn directory into bind mount */
4037 if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
4038 log_error_errno(errno, "Failed to make bind mount: %m");
4039 _exit(EXIT_FAILURE);
4040 }
4041
4042 r = setup_volatile(arg_directory);
4043 if (r < 0)
4044 _exit(EXIT_FAILURE);
4045
4046 if (setup_volatile_state(arg_directory) < 0)
4047 _exit(EXIT_FAILURE);
4048
4049 r = base_filesystem_create(arg_directory);
4050 if (r < 0)
4051 _exit(EXIT_FAILURE);
4052
4053 if (arg_read_only) {
4054 r = bind_remount_recursive(arg_directory, true);
4055 if (r < 0) {
4056 log_error_errno(r, "Failed to make tree read-only: %m");
4057 _exit(EXIT_FAILURE);
4058 }
4059 }
4060
4061 if (mount_all(arg_directory) < 0)
4062 _exit(EXIT_FAILURE);
4063
4064 if (copy_devnodes(arg_directory) < 0)
4065 _exit(EXIT_FAILURE);
4066
4067 if (setup_ptmx(arg_directory) < 0)
4068 _exit(EXIT_FAILURE);
4069
4070 dev_setup(arg_directory);
4071
4072 if (setup_propagate(arg_directory) < 0)
4073 _exit(EXIT_FAILURE);
4074
4075 if (setup_seccomp() < 0)
4076 _exit(EXIT_FAILURE);
4077
4078 if (setup_dev_console(arg_directory, console) < 0)
4079 _exit(EXIT_FAILURE);
4080
4081 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4082 _exit(EXIT_FAILURE);
4083 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4084
4085 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4086 _exit(EXIT_FAILURE);
4087 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4088
4089 /* Tell the parent that we are ready, and that
4090 * it can cgroupify us to that we lack access
4091 * to certain devices and resources. */
4092 (void) barrier_place(&barrier); /* #1 */
4093
4094 if (setup_boot_id(arg_directory) < 0)
4095 _exit(EXIT_FAILURE);
4096
4097 if (setup_timezone(arg_directory) < 0)
4098 _exit(EXIT_FAILURE);
4099
4100 if (setup_resolv_conf(arg_directory) < 0)
4101 _exit(EXIT_FAILURE);
4102
4103 if (setup_journal(arg_directory) < 0)
4104 _exit(EXIT_FAILURE);
4105
4106 if (mount_binds(arg_directory, arg_bind, false) < 0)
4107 _exit(EXIT_FAILURE);
4108
4109 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4110 _exit(EXIT_FAILURE);
4111
4112 if (mount_tmpfs(arg_directory) < 0)
4113 _exit(EXIT_FAILURE);
4114
4115 /* Wait until we are cgroup-ified, so that we
4116 * can mount the right cgroup path writable */
4117 (void) barrier_place_and_sync(&barrier); /* #2 */
4118
4119 if (mount_cgroup(arg_directory) < 0)
4120 _exit(EXIT_FAILURE);
4121
4122 if (chdir(arg_directory) < 0) {
4123 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4124 _exit(EXIT_FAILURE);
4125 }
4126
4127 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4128 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4129 _exit(EXIT_FAILURE);
4130 }
4131
4132 if (chroot(".") < 0) {
4133 log_error_errno(errno, "chroot() failed: %m");
4134 _exit(EXIT_FAILURE);
4135 }
4136
4137 if (chdir("/") < 0) {
4138 log_error_errno(errno, "chdir() failed: %m");
4139 _exit(EXIT_FAILURE);
4140 }
4141
4142 if (arg_userns) {
4143 if (unshare(CLONE_NEWUSER) < 0) {
4144 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4145 _exit(EXIT_FAILURE);
4146 }
4147
4148 /* Tell the parent, that it now can
4149 * write the UID map. */
4150 (void) barrier_place(&barrier); /* #3 */
4151
4152 /* Wait until the parent wrote the UID
4153 * map */
4154 (void) barrier_place_and_sync(&barrier); /* #4 */
4155 }
4156
4157 umask(0022);
4158
4159 if (drop_capabilities() < 0) {
4160 log_error_errno(errno, "drop_capabilities() failed: %m");
4161 _exit(EXIT_FAILURE);
4162 }
4163
4164 setup_hostname();
4165
4166 if (arg_personality != 0xffffffffLU) {
4167 if (personality(arg_personality) < 0) {
4168 log_error_errno(errno, "personality() failed: %m");
4169 _exit(EXIT_FAILURE);
4170 }
4171 } else if (secondary) {
4172 if (personality(PER_LINUX32) < 0) {
4173 log_error_errno(errno, "personality() failed: %m");
4174 _exit(EXIT_FAILURE);
4175 }
4176 }
4177
4178 #ifdef HAVE_SELINUX
4179 if (arg_selinux_context)
4180 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4181 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4182 _exit(EXIT_FAILURE);
4183 }
4184 #endif
4185
4186 r = change_uid_gid(&home);
4187 if (r < 0)
4188 _exit(EXIT_FAILURE);
4189
4190 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4191 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4192 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4193 log_oom();
4194 _exit(EXIT_FAILURE);
4195 }
4196
4197 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4198 char as_uuid[37];
4199
4200 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4201 log_oom();
4202 _exit(EXIT_FAILURE);
4203 }
4204 }
4205
4206 if (fdset_size(fds) > 0) {
4207 r = fdset_cloexec(fds, false);
4208 if (r < 0) {
4209 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4210 _exit(EXIT_FAILURE);
4211 }
4212
4213 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4214 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4215 log_oom();
4216 _exit(EXIT_FAILURE);
4217 }
4218 }
4219
4220 if (!strv_isempty(arg_setenv)) {
4221 char **n;
4222
4223 n = strv_env_merge(2, envp, arg_setenv);
4224 if (!n) {
4225 log_oom();
4226 _exit(EXIT_FAILURE);
4227 }
4228
4229 env_use = n;
4230 } else
4231 env_use = (char**) envp;
4232
4233 /* Let the parent know that we are ready and
4234 * wait until the parent is ready with the
4235 * setup, too... */
4236 (void) barrier_place_and_sync(&barrier); /* #5 */
4237
4238 if (arg_boot) {
4239 char **a;
4240 size_t l;
4241
4242 /* Automatically search for the init system */
4243
4244 l = 1 + argc - optind;
4245 a = newa(char*, l + 1);
4246 memcpy(a + 1, argv + optind, l * sizeof(char*));
4247
4248 a[0] = (char*) "/usr/lib/systemd/systemd";
4249 execve(a[0], a, env_use);
4250
4251 a[0] = (char*) "/lib/systemd/systemd";
4252 execve(a[0], a, env_use);
4253
4254 a[0] = (char*) "/sbin/init";
4255 execve(a[0], a, env_use);
4256 } else if (argc > optind)
4257 execvpe(argv[optind], argv + optind, env_use);
4258 else {
4259 chdir(home ? home : "/root");
4260 execle("/bin/bash", "-bash", NULL, env_use);
4261 execle("/bin/sh", "-sh", NULL, env_use);
4262 }
4263
4264 log_error_errno(errno, "execv() failed: %m");
4265 _exit(EXIT_FAILURE);
4266 }
4267
4268 barrier_set_role(&barrier, BARRIER_PARENT);
4269 fdset_free(fds);
4270 fds = NULL;
4271
4272 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4273 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4274
4275 (void) barrier_place(&barrier); /* #1 */
4276
4277 /* Wait for the most basic Child-setup to be done,
4278 * before we add hardware to it, and place it in a
4279 * cgroup. */
4280 if (barrier_sync(&barrier)) { /* #1 */
4281 int ifi = 0;
4282
4283 r = move_network_interfaces(pid);
4284 if (r < 0)
4285 goto finish;
4286
4287 r = setup_veth(pid, veth_name, &ifi);
4288 if (r < 0)
4289 goto finish;
4290
4291 r = setup_bridge(veth_name, &ifi);
4292 if (r < 0)
4293 goto finish;
4294
4295 r = setup_macvlan(pid);
4296 if (r < 0)
4297 goto finish;
4298
4299 r = setup_ipvlan(pid);
4300 if (r < 0)
4301 goto finish;
4302
4303 r = register_machine(pid, ifi);
4304 if (r < 0)
4305 goto finish;
4306
4307 /* Notify the child that the parent is ready with all
4308 * its setup, and that the child can now hand over
4309 * control to the code to run inside the container. */
4310 (void) barrier_place(&barrier); /* #2 */
4311
4312 if (arg_userns) {
4313 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4314
4315 (void) barrier_place_and_sync(&barrier); /* #3 */
4316
4317 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4318 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4319 r = write_string_file(uid_map, line);
4320 if (r < 0) {
4321 log_error_errno(r, "Failed to write UID map: %m");
4322 goto finish;
4323 }
4324
4325 /* We always assign the same UID and GID ranges */
4326 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4327 r = write_string_file(uid_map, line);
4328 if (r < 0) {
4329 log_error_errno(r, "Failed to write GID map: %m");
4330 goto finish;
4331 }
4332
4333 (void) barrier_place(&barrier); /* #4 */
4334 }
4335
4336 /* Block SIGCHLD here, before notifying child.
4337 * process_pty() will handle it with the other signals. */
4338 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4339 if (r < 0)
4340 goto finish;
4341
4342 /* Reset signal to default */
4343 r = default_signals(SIGCHLD, -1);
4344 if (r < 0)
4345 goto finish;
4346
4347 /* Let the child know that we are ready and wait that the child is completely ready now. */
4348 if (barrier_place_and_sync(&barrier)) { /* #5 */
4349 _cleanup_event_unref_ sd_event *event = NULL;
4350 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4351 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4352 char last_char = 0;
4353
4354 sd_notifyf(false,
4355 "READY=1\n"
4356 "STATUS=Container running.\n"
4357 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4358
4359 r = sd_event_new(&event);
4360 if (r < 0) {
4361 log_error_errno(r, "Failed to get default event source: %m");
4362 goto finish;
4363 }
4364
4365 if (arg_kill_signal > 0) {
4366 /* Try to kill the init system on SIGINT or SIGTERM */
4367 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4368 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4369 } else {
4370 /* Immediately exit */
4371 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4372 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4373 }
4374
4375 /* simply exit on sigchld */
4376 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4377
4378 if (arg_expose_ports) {
4379 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4380 if (r < 0)
4381 goto finish;
4382
4383 (void) expose_ports(rtnl, &exposed);
4384 }
4385
4386 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4387
4388 r = pty_forward_new(event, master, true, !interactive, &forward);
4389 if (r < 0) {
4390 log_error_errno(r, "Failed to create PTY forwarder: %m");
4391 goto finish;
4392 }
4393
4394 r = sd_event_loop(event);
4395 if (r < 0) {
4396 log_error_errno(r, "Failed to run event loop: %m");
4397 goto finish;
4398 }
4399
4400 pty_forward_get_last_char(forward, &last_char);
4401
4402 forward = pty_forward_free(forward);
4403
4404 if (!arg_quiet && last_char != '\n')
4405 putc('\n', stdout);
4406
4407 /* Kill if it is not dead yet anyway */
4408 terminate_machine(pid);
4409 }
4410 }
4411
4412 /* Normally redundant, but better safe than sorry */
4413 kill(pid, SIGKILL);
4414
4415 r = wait_for_container(pid, &container_status);
4416 pid = 0;
4417
4418 if (r < 0)
4419 /* We failed to wait for the container, or the
4420 * container exited abnormally */
4421 goto finish;
4422 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4423 /* The container exited with a non-zero
4424 * status, or with zero status and no reboot
4425 * was requested. */
4426 ret = r;
4427 break;
4428 }
4429
4430 /* CONTAINER_REBOOTED, loop again */
4431
4432 if (arg_keep_unit) {
4433 /* Special handling if we are running as a
4434 * service: instead of simply restarting the
4435 * machine we want to restart the entire
4436 * service, so let's inform systemd about this
4437 * with the special exit code 133. The service
4438 * file uses RestartForceExitStatus=133 so
4439 * that this results in a full nspawn
4440 * restart. This is necessary since we might
4441 * have cgroup parameters set we want to have
4442 * flushed out. */
4443 ret = 133;
4444 r = 0;
4445 break;
4446 }
4447
4448 flush_ports(&exposed);
4449 }
4450
4451 finish:
4452 sd_notify(false,
4453 "STOPPING=1\n"
4454 "STATUS=Terminating...");
4455
4456 loop_remove(loop_nr, &image_fd);
4457
4458 if (pid > 0)
4459 kill(pid, SIGKILL);
4460
4461 if (remove_subvol && arg_directory) {
4462 int k;
4463
4464 k = btrfs_subvol_remove(arg_directory, true);
4465 if (k < 0)
4466 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4467 }
4468
4469 if (arg_machine) {
4470 const char *p;
4471
4472 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4473 (void) rm_rf(p, REMOVE_ROOT);
4474 }
4475
4476 free(arg_directory);
4477 free(arg_template);
4478 free(arg_image);
4479 free(arg_machine);
4480 free(arg_user);
4481 strv_free(arg_setenv);
4482 strv_free(arg_network_interfaces);
4483 strv_free(arg_network_macvlan);
4484 strv_free(arg_network_ipvlan);
4485 strv_free(arg_bind);
4486 strv_free(arg_bind_ro);
4487 strv_free(arg_tmpfs);
4488
4489 flush_ports(&exposed);
4490
4491 while (arg_expose_ports) {
4492 ExposePort *p = arg_expose_ports;
4493 LIST_REMOVE(ports, arg_expose_ports, p);
4494 free(p);
4495 }
4496
4497 return r < 0 ? EXIT_FAILURE : ret;
4498 }