]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn.c
util: rework rm_rf() logic
[thirdparty/systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "log.h"
60 #include "util.h"
61 #include "mkdir.h"
62 #include "rm-rf.h"
63 #include "macro.h"
64 #include "missing.h"
65 #include "cgroup-util.h"
66 #include "strv.h"
67 #include "path-util.h"
68 #include "loopback-setup.h"
69 #include "dev-setup.h"
70 #include "fdset.h"
71 #include "build.h"
72 #include "fileio.h"
73 #include "bus-util.h"
74 #include "bus-error.h"
75 #include "ptyfwd.h"
76 #include "env-util.h"
77 #include "rtnl-util.h"
78 #include "udev-util.h"
79 #include "blkid-util.h"
80 #include "gpt.h"
81 #include "siphash24.h"
82 #include "copy.h"
83 #include "base-filesystem.h"
84 #include "barrier.h"
85 #include "event-util.h"
86 #include "capability.h"
87 #include "cap-list.h"
88 #include "btrfs-util.h"
89 #include "machine-image.h"
90 #include "list.h"
91 #include "in-addr-util.h"
92 #include "fw-util.h"
93 #include "local-addresses.h"
94
95 #ifdef HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98
99 typedef struct ExposePort {
100 int protocol;
101 uint16_t host_port;
102 uint16_t container_port;
103 LIST_FIELDS(struct ExposePort, ports);
104 } ExposePort;
105
/* Outcome of a container run. NOTE(review): the consumers of this enum are
 * outside the visible part of the file; the names suggest "exited for good"
 * versus "asked to be restarted". */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
110
/* Journal linking mode, selected with --link-journal= (or -j). The "try-"
 * variants of the option map to LINK_GUEST/LINK_HOST plus the separate
 * arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
117
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,            /* option absent, or a boolean false argument */
        VOLATILE_YES,           /* option without argument, or a boolean true argument */
        VOLATILE_STATE,         /* --volatile=state */
} Volatile;
123
124 static char *arg_directory = NULL;
125 static char *arg_template = NULL;
126 static char *arg_user = NULL;
127 static sd_id128_t arg_uuid = {};
128 static char *arg_machine = NULL;
129 static const char *arg_selinux_context = NULL;
130 static const char *arg_selinux_apifs_context = NULL;
131 static const char *arg_slice = NULL;
132 static bool arg_private_network = false;
133 static bool arg_read_only = false;
134 static bool arg_boot = false;
135 static bool arg_ephemeral = false;
136 static LinkJournal arg_link_journal = LINK_AUTO;
137 static bool arg_link_journal_try = false;
138 static uint64_t arg_retain =
139 (1ULL << CAP_CHOWN) |
140 (1ULL << CAP_DAC_OVERRIDE) |
141 (1ULL << CAP_DAC_READ_SEARCH) |
142 (1ULL << CAP_FOWNER) |
143 (1ULL << CAP_FSETID) |
144 (1ULL << CAP_IPC_OWNER) |
145 (1ULL << CAP_KILL) |
146 (1ULL << CAP_LEASE) |
147 (1ULL << CAP_LINUX_IMMUTABLE) |
148 (1ULL << CAP_NET_BIND_SERVICE) |
149 (1ULL << CAP_NET_BROADCAST) |
150 (1ULL << CAP_NET_RAW) |
151 (1ULL << CAP_SETGID) |
152 (1ULL << CAP_SETFCAP) |
153 (1ULL << CAP_SETPCAP) |
154 (1ULL << CAP_SETUID) |
155 (1ULL << CAP_SYS_ADMIN) |
156 (1ULL << CAP_SYS_CHROOT) |
157 (1ULL << CAP_SYS_NICE) |
158 (1ULL << CAP_SYS_PTRACE) |
159 (1ULL << CAP_SYS_TTY_CONFIG) |
160 (1ULL << CAP_SYS_RESOURCE) |
161 (1ULL << CAP_SYS_BOOT) |
162 (1ULL << CAP_AUDIT_WRITE) |
163 (1ULL << CAP_AUDIT_CONTROL) |
164 (1ULL << CAP_MKNOD);
165 static char **arg_bind = NULL;
166 static char **arg_bind_ro = NULL;
167 static char **arg_tmpfs = NULL;
168 static char **arg_setenv = NULL;
169 static bool arg_quiet = false;
170 static bool arg_share_system = false;
171 static bool arg_register = true;
172 static bool arg_keep_unit = false;
173 static char **arg_network_interfaces = NULL;
174 static char **arg_network_macvlan = NULL;
175 static char **arg_network_ipvlan = NULL;
176 static bool arg_network_veth = false;
177 static const char *arg_network_bridge = NULL;
178 static unsigned long arg_personality = 0xffffffffLU;
179 static char *arg_image = NULL;
180 static Volatile arg_volatile = VOLATILE_NO;
181 static ExposePort *arg_expose_ports = NULL;
182 static char **arg_property = NULL;
183 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
184 static bool arg_userns = false;
185 static int arg_kill_signal = 0;
186
187 static void help(void) {
188 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
189 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
190 " -h --help Show this help\n"
191 " --version Print version string\n"
192 " -q --quiet Do not show status information\n"
193 " -D --directory=PATH Root directory for the container\n"
194 " --template=PATH Initialize root directory from template directory,\n"
195 " if missing\n"
196 " -x --ephemeral Run container with snapshot of root directory, and\n"
197 " remove it after exit\n"
198 " -i --image=PATH File system device or disk image for the container\n"
199 " -b --boot Boot up full system (i.e. invoke init)\n"
200 " -u --user=USER Run the command under specified user or uid\n"
201 " -M --machine=NAME Set the machine name for the container\n"
202 " --uuid=UUID Set a specific machine UUID for the container\n"
203 " -S --slice=SLICE Place the container in the specified slice\n"
204 " --property=NAME=VALUE Set scope unit property\n"
205 " --private-network Disable network in container\n"
206 " --network-interface=INTERFACE\n"
207 " Assign an existing network interface to the\n"
208 " container\n"
209 " --network-macvlan=INTERFACE\n"
210 " Create a macvlan network interface based on an\n"
211 " existing network interface to the container\n"
212 " --network-ipvlan=INTERFACE\n"
213 " Create a ipvlan network interface based on an\n"
214 " existing network interface to the container\n"
215 " -n --network-veth Add a virtual ethernet connection between host\n"
216 " and container\n"
217 " --network-bridge=INTERFACE\n"
218 " Add a virtual ethernet connection between host\n"
219 " and container and add it to an existing bridge on\n"
220 " the host\n"
221 " --private-users[=UIDBASE[:NUIDS]]\n"
222 " Run within user namespace\n"
223 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
224 " Expose a container IP port on the host\n"
225 " -Z --selinux-context=SECLABEL\n"
226 " Set the SELinux security context to be used by\n"
227 " processes in the container\n"
228 " -L --selinux-apifs-context=SECLABEL\n"
229 " Set the SELinux security context to be used by\n"
230 " API/tmpfs file systems in the container\n"
231 " --capability=CAP In addition to the default, retain specified\n"
232 " capability\n"
233 " --drop-capability=CAP Drop the specified capability from the default set\n"
234 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
235 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
236 " try-guest, try-host\n"
237 " -j Equivalent to --link-journal=try-guest\n"
238 " --read-only Mount the root directory read-only\n"
239 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
240 " the container\n"
241 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
242 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
244 " --share-system Share system namespaces with host\n"
245 " --register=BOOLEAN Register container as machine\n"
246 " --keep-unit Do not register a scope for the machine, reuse\n"
247 " the service unit nspawn is running in\n"
248 " --volatile[=MODE] Run the system in volatile mode\n"
249 , program_invocation_short_name);
250 }
251
/* Stores a cleaned-up version of "path" in *b, releasing whatever *b held
 * before.
 *
 * The path is resolved with canonicalize_file_name(). If that fails because
 * the path does not exist (ENOENT), the path is merely made absolute
 * relative to the current working directory instead. The result is passed
 * through path_kill_slashes() before it is stored.
 *
 * Returns 0 on success, -ENOMEM if the fallback allocation fails, or the
 * negated errno of canonicalize_file_name() for any failure other than
 * ENOENT. On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Does not exist (yet): settle for an absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275 enum {
276 ARG_VERSION = 0x100,
277 ARG_PRIVATE_NETWORK,
278 ARG_UUID,
279 ARG_READ_ONLY,
280 ARG_CAPABILITY,
281 ARG_DROP_CAPABILITY,
282 ARG_LINK_JOURNAL,
283 ARG_BIND,
284 ARG_BIND_RO,
285 ARG_TMPFS,
286 ARG_SETENV,
287 ARG_SHARE_SYSTEM,
288 ARG_REGISTER,
289 ARG_KEEP_UNIT,
290 ARG_NETWORK_INTERFACE,
291 ARG_NETWORK_MACVLAN,
292 ARG_NETWORK_IPVLAN,
293 ARG_NETWORK_BRIDGE,
294 ARG_PERSONALITY,
295 ARG_VOLATILE,
296 ARG_TEMPLATE,
297 ARG_PROPERTY,
298 ARG_PRIVATE_USERS,
299 ARG_KILL_SIGNAL,
300 };
301
302 static const struct option options[] = {
303 { "help", no_argument, NULL, 'h' },
304 { "version", no_argument, NULL, ARG_VERSION },
305 { "directory", required_argument, NULL, 'D' },
306 { "template", required_argument, NULL, ARG_TEMPLATE },
307 { "ephemeral", no_argument, NULL, 'x' },
308 { "user", required_argument, NULL, 'u' },
309 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
310 { "boot", no_argument, NULL, 'b' },
311 { "uuid", required_argument, NULL, ARG_UUID },
312 { "read-only", no_argument, NULL, ARG_READ_ONLY },
313 { "capability", required_argument, NULL, ARG_CAPABILITY },
314 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
315 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
316 { "bind", required_argument, NULL, ARG_BIND },
317 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
318 { "tmpfs", required_argument, NULL, ARG_TMPFS },
319 { "machine", required_argument, NULL, 'M' },
320 { "slice", required_argument, NULL, 'S' },
321 { "setenv", required_argument, NULL, ARG_SETENV },
322 { "selinux-context", required_argument, NULL, 'Z' },
323 { "selinux-apifs-context", required_argument, NULL, 'L' },
324 { "quiet", no_argument, NULL, 'q' },
325 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
326 { "register", required_argument, NULL, ARG_REGISTER },
327 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
328 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
329 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
330 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
331 { "network-veth", no_argument, NULL, 'n' },
332 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
333 { "personality", required_argument, NULL, ARG_PERSONALITY },
334 { "image", required_argument, NULL, 'i' },
335 { "volatile", optional_argument, NULL, ARG_VOLATILE },
336 { "port", required_argument, NULL, 'p' },
337 { "property", required_argument, NULL, ARG_PROPERTY },
338 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
339 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
340 {}
341 };
342
343 int c, r;
344 uint64_t plus = 0, minus = 0;
345
346 assert(argc >= 0);
347 assert(argv);
348
349 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
350
351 switch (c) {
352
353 case 'h':
354 help();
355 return 0;
356
357 case ARG_VERSION:
358 puts(PACKAGE_STRING);
359 puts(SYSTEMD_FEATURES);
360 return 0;
361
362 case 'D':
363 r = set_sanitized_path(&arg_directory, optarg);
364 if (r < 0)
365 return log_error_errno(r, "Invalid root directory: %m");
366
367 break;
368
369 case ARG_TEMPLATE:
370 r = set_sanitized_path(&arg_template, optarg);
371 if (r < 0)
372 return log_error_errno(r, "Invalid template directory: %m");
373
374 break;
375
376 case 'i':
377 r = set_sanitized_path(&arg_image, optarg);
378 if (r < 0)
379 return log_error_errno(r, "Invalid image path: %m");
380
381 break;
382
383 case 'x':
384 arg_ephemeral = true;
385 break;
386
387 case 'u':
388 free(arg_user);
389 arg_user = strdup(optarg);
390 if (!arg_user)
391 return log_oom();
392
393 break;
394
395 case ARG_NETWORK_BRIDGE:
396 arg_network_bridge = optarg;
397
398 /* fall through */
399
400 case 'n':
401 arg_network_veth = true;
402 arg_private_network = true;
403 break;
404
405 case ARG_NETWORK_INTERFACE:
406 if (strv_extend(&arg_network_interfaces, optarg) < 0)
407 return log_oom();
408
409 arg_private_network = true;
410 break;
411
412 case ARG_NETWORK_MACVLAN:
413 if (strv_extend(&arg_network_macvlan, optarg) < 0)
414 return log_oom();
415
416 arg_private_network = true;
417 break;
418
419 case ARG_NETWORK_IPVLAN:
420 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
421 return log_oom();
422
423 /* fall through */
424
425 case ARG_PRIVATE_NETWORK:
426 arg_private_network = true;
427 break;
428
429 case 'b':
430 arg_boot = true;
431 break;
432
433 case ARG_UUID:
434 r = sd_id128_from_string(optarg, &arg_uuid);
435 if (r < 0) {
436 log_error("Invalid UUID: %s", optarg);
437 return r;
438 }
439 break;
440
441 case 'S':
442 arg_slice = optarg;
443 break;
444
445 case 'M':
446 if (isempty(optarg)) {
447 free(arg_machine);
448 arg_machine = NULL;
449 } else {
450 if (!machine_name_is_valid(optarg)) {
451 log_error("Invalid machine name: %s", optarg);
452 return -EINVAL;
453 }
454
455 r = free_and_strdup(&arg_machine, optarg);
456 if (r < 0)
457 return log_oom();
458
459 break;
460 }
461
462 case 'Z':
463 arg_selinux_context = optarg;
464 break;
465
466 case 'L':
467 arg_selinux_apifs_context = optarg;
468 break;
469
470 case ARG_READ_ONLY:
471 arg_read_only = true;
472 break;
473
474 case ARG_CAPABILITY:
475 case ARG_DROP_CAPABILITY: {
476 const char *state, *word;
477 size_t length;
478
479 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
480 _cleanup_free_ char *t;
481
482 t = strndup(word, length);
483 if (!t)
484 return log_oom();
485
486 if (streq(t, "all")) {
487 if (c == ARG_CAPABILITY)
488 plus = (uint64_t) -1;
489 else
490 minus = (uint64_t) -1;
491 } else {
492 int cap;
493
494 cap = capability_from_name(t);
495 if (cap < 0) {
496 log_error("Failed to parse capability %s.", t);
497 return -EINVAL;
498 }
499
500 if (c == ARG_CAPABILITY)
501 plus |= 1ULL << (uint64_t) cap;
502 else
503 minus |= 1ULL << (uint64_t) cap;
504 }
505 }
506
507 break;
508 }
509
510 case 'j':
511 arg_link_journal = LINK_GUEST;
512 arg_link_journal_try = true;
513 break;
514
515 case ARG_LINK_JOURNAL:
516 if (streq(optarg, "auto")) {
517 arg_link_journal = LINK_AUTO;
518 arg_link_journal_try = false;
519 } else if (streq(optarg, "no")) {
520 arg_link_journal = LINK_NO;
521 arg_link_journal_try = false;
522 } else if (streq(optarg, "guest")) {
523 arg_link_journal = LINK_GUEST;
524 arg_link_journal_try = false;
525 } else if (streq(optarg, "host")) {
526 arg_link_journal = LINK_HOST;
527 arg_link_journal_try = false;
528 } else if (streq(optarg, "try-guest")) {
529 arg_link_journal = LINK_GUEST;
530 arg_link_journal_try = true;
531 } else if (streq(optarg, "try-host")) {
532 arg_link_journal = LINK_HOST;
533 arg_link_journal_try = true;
534 } else {
535 log_error("Failed to parse link journal mode %s", optarg);
536 return -EINVAL;
537 }
538
539 break;
540
541 case ARG_BIND:
542 case ARG_BIND_RO: {
543 _cleanup_free_ char *a = NULL, *b = NULL;
544 char *e;
545 char ***x;
546
547 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
548
549 e = strchr(optarg, ':');
550 if (e) {
551 a = strndup(optarg, e - optarg);
552 b = strdup(e + 1);
553 } else {
554 a = strdup(optarg);
555 b = strdup(optarg);
556 }
557
558 if (!a || !b)
559 return log_oom();
560
561 if (!path_is_absolute(a) || !path_is_absolute(b)) {
562 log_error("Invalid bind mount specification: %s", optarg);
563 return -EINVAL;
564 }
565
566 r = strv_extend(x, a);
567 if (r < 0)
568 return log_oom();
569
570 r = strv_extend(x, b);
571 if (r < 0)
572 return log_oom();
573
574 break;
575 }
576
577 case ARG_TMPFS: {
578 _cleanup_free_ char *a = NULL, *b = NULL;
579 char *e;
580
581 e = strchr(optarg, ':');
582 if (e) {
583 a = strndup(optarg, e - optarg);
584 b = strdup(e + 1);
585 } else {
586 a = strdup(optarg);
587 b = strdup("mode=0755");
588 }
589
590 if (!a || !b)
591 return log_oom();
592
593 if (!path_is_absolute(a)) {
594 log_error("Invalid tmpfs specification: %s", optarg);
595 return -EINVAL;
596 }
597
598 r = strv_push(&arg_tmpfs, a);
599 if (r < 0)
600 return log_oom();
601
602 a = NULL;
603
604 r = strv_push(&arg_tmpfs, b);
605 if (r < 0)
606 return log_oom();
607
608 b = NULL;
609
610 break;
611 }
612
613 case ARG_SETENV: {
614 char **n;
615
616 if (!env_assignment_is_valid(optarg)) {
617 log_error("Environment variable assignment '%s' is not valid.", optarg);
618 return -EINVAL;
619 }
620
621 n = strv_env_set(arg_setenv, optarg);
622 if (!n)
623 return log_oom();
624
625 strv_free(arg_setenv);
626 arg_setenv = n;
627 break;
628 }
629
630 case 'q':
631 arg_quiet = true;
632 break;
633
634 case ARG_SHARE_SYSTEM:
635 arg_share_system = true;
636 break;
637
638 case ARG_REGISTER:
639 r = parse_boolean(optarg);
640 if (r < 0) {
641 log_error("Failed to parse --register= argument: %s", optarg);
642 return r;
643 }
644
645 arg_register = r;
646 break;
647
648 case ARG_KEEP_UNIT:
649 arg_keep_unit = true;
650 break;
651
652 case ARG_PERSONALITY:
653
654 arg_personality = personality_from_string(optarg);
655 if (arg_personality == 0xffffffffLU) {
656 log_error("Unknown or unsupported personality '%s'.", optarg);
657 return -EINVAL;
658 }
659
660 break;
661
662 case ARG_VOLATILE:
663
664 if (!optarg)
665 arg_volatile = VOLATILE_YES;
666 else {
667 r = parse_boolean(optarg);
668 if (r < 0) {
669 if (streq(optarg, "state"))
670 arg_volatile = VOLATILE_STATE;
671 else {
672 log_error("Failed to parse --volatile= argument: %s", optarg);
673 return r;
674 }
675 } else
676 arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
677 }
678
679 break;
680
681 case 'p': {
682 const char *split, *e;
683 uint16_t container_port, host_port;
684 int protocol;
685 ExposePort *p;
686
687 if ((e = startswith(optarg, "tcp:")))
688 protocol = IPPROTO_TCP;
689 else if ((e = startswith(optarg, "udp:")))
690 protocol = IPPROTO_UDP;
691 else {
692 e = optarg;
693 protocol = IPPROTO_TCP;
694 }
695
696 split = strchr(e, ':');
697 if (split) {
698 char v[split - e + 1];
699
700 memcpy(v, e, split - e);
701 v[split - e] = 0;
702
703 r = safe_atou16(v, &host_port);
704 if (r < 0 || host_port <= 0) {
705 log_error("Failed to parse host port: %s", optarg);
706 return -EINVAL;
707 }
708
709 r = safe_atou16(split + 1, &container_port);
710 } else {
711 r = safe_atou16(e, &container_port);
712 host_port = container_port;
713 }
714
715 if (r < 0 || container_port <= 0) {
716 log_error("Failed to parse host port: %s", optarg);
717 return -EINVAL;
718 }
719
720 LIST_FOREACH(ports, p, arg_expose_ports) {
721 if (p->protocol == protocol && p->host_port == host_port) {
722 log_error("Duplicate port specification: %s", optarg);
723 return -EINVAL;
724 }
725 }
726
727 p = new(ExposePort, 1);
728 if (!p)
729 return log_oom();
730
731 p->protocol = protocol;
732 p->host_port = host_port;
733 p->container_port = container_port;
734
735 LIST_PREPEND(ports, arg_expose_ports, p);
736
737 break;
738 }
739
740 case ARG_PROPERTY:
741 if (strv_extend(&arg_property, optarg) < 0)
742 return log_oom();
743
744 break;
745
746 case ARG_PRIVATE_USERS:
747 if (optarg) {
748 _cleanup_free_ char *buffer = NULL;
749 const char *range, *shift;
750
751 range = strchr(optarg, ':');
752 if (range) {
753 buffer = strndup(optarg, range - optarg);
754 if (!buffer)
755 return log_oom();
756 shift = buffer;
757
758 range++;
759 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
760 log_error("Failed to parse UID range: %s", range);
761 return -EINVAL;
762 }
763 } else
764 shift = optarg;
765
766 if (parse_uid(shift, &arg_uid_shift) < 0) {
767 log_error("Failed to parse UID: %s", optarg);
768 return -EINVAL;
769 }
770 }
771
772 arg_userns = true;
773 break;
774
775 case ARG_KILL_SIGNAL:
776 arg_kill_signal = signal_from_string_try_harder(optarg);
777 if (arg_kill_signal < 0) {
778 log_error("Cannot parse signal: %s", optarg);
779 return -EINVAL;
780 }
781
782 break;
783
784 case '?':
785 return -EINVAL;
786
787 default:
788 assert_not_reached("Unhandled option");
789 }
790
791 if (arg_share_system)
792 arg_register = false;
793
794 if (arg_boot && arg_share_system) {
795 log_error("--boot and --share-system may not be combined.");
796 return -EINVAL;
797 }
798
799 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
800 log_error("--keep-unit may not be used when invoked from a user session.");
801 return -EINVAL;
802 }
803
804 if (arg_directory && arg_image) {
805 log_error("--directory= and --image= may not be combined.");
806 return -EINVAL;
807 }
808
809 if (arg_template && arg_image) {
810 log_error("--template= and --image= may not be combined.");
811 return -EINVAL;
812 }
813
814 if (arg_template && !(arg_directory || arg_machine)) {
815 log_error("--template= needs --directory= or --machine=.");
816 return -EINVAL;
817 }
818
819 if (arg_ephemeral && arg_template) {
820 log_error("--ephemeral and --template= may not be combined.");
821 return -EINVAL;
822 }
823
824 if (arg_ephemeral && arg_image) {
825 log_error("--ephemeral and --image= may not be combined.");
826 return -EINVAL;
827 }
828
829 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
830 log_error("--ephemeral and --link-journal= may not be combined.");
831 return -EINVAL;
832 }
833
834 if (arg_volatile != VOLATILE_NO && arg_read_only) {
835 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
836 return -EINVAL;
837 }
838
839 if (arg_expose_ports && !arg_private_network) {
840 log_error("Cannot use --port= without private networking.");
841 return -EINVAL;
842 }
843
844 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
845
846 if (arg_boot && arg_kill_signal <= 0)
847 arg_kill_signal = SIGRTMIN+3;
848
849 return 1;
850 }
851
852 static int mount_all(const char *dest) {
853
854 typedef struct MountPoint {
855 const char *what;
856 const char *where;
857 const char *type;
858 const char *options;
859 unsigned long flags;
860 bool fatal;
861 } MountPoint;
862
863 static const MountPoint mount_table[] = {
864 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
865 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
866 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
867 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
868 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
869 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
870 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
871 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
872 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true },
873 #ifdef HAVE_SELINUX
874 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
875 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
876 #endif
877 };
878
879 unsigned k;
880 int r = 0;
881
882 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
883 _cleanup_free_ char *where = NULL, *options = NULL;
884 const char *o;
885 int t;
886
887 where = strjoin(dest, "/", mount_table[k].where, NULL);
888 if (!where)
889 return log_oom();
890
891 t = path_is_mount_point(where, true);
892 if (t < 0) {
893 log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
894
895 if (r == 0)
896 r = t;
897
898 continue;
899 }
900
901 /* Skip this entry if it is not a remount. */
902 if (mount_table[k].what && t > 0)
903 continue;
904
905 t = mkdir_p(where, 0755);
906 if (t < 0) {
907 if (mount_table[k].fatal) {
908 log_error_errno(t, "Failed to create directory %s: %m", where);
909
910 if (r == 0)
911 r = t;
912 } else
913 log_warning_errno(t, "Failed to create directory %s: %m", where);
914
915 continue;
916 }
917
918 #ifdef HAVE_SELINUX
919 if (arg_selinux_apifs_context &&
920 (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
921 options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
922 if (!options)
923 return log_oom();
924
925 o = options;
926 } else
927 #endif
928 o = mount_table[k].options;
929
930 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
931 char *uid_options = NULL;
932
933 if (o)
934 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
935 else
936 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
937 if (!uid_options)
938 return log_oom();
939
940 free(options);
941 o = options = uid_options;
942 }
943
944 if (mount(mount_table[k].what,
945 where,
946 mount_table[k].type,
947 mount_table[k].flags,
948 o) < 0) {
949
950 if (mount_table[k].fatal) {
951 log_error_errno(errno, "mount(%s) failed: %m", where);
952
953 if (r == 0)
954 r = -errno;
955 } else
956 log_warning_errno(errno, "mount(%s) failed: %m", where);
957 }
958 }
959
960 return r;
961 }
962
963 static int mount_binds(const char *dest, char **l, bool ro) {
964 char **x, **y;
965
966 STRV_FOREACH_PAIR(x, y, l) {
967 _cleanup_free_ char *where = NULL;
968 struct stat source_st, dest_st;
969 int r;
970
971 if (stat(*x, &source_st) < 0)
972 return log_error_errno(errno, "Failed to stat %s: %m", *x);
973
974 where = strappend(dest, *y);
975 if (!where)
976 return log_oom();
977
978 r = stat(where, &dest_st);
979 if (r == 0) {
980 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
981 log_error("Cannot bind mount directory %s on file %s.", *x, where);
982 return -EINVAL;
983 }
984 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
985 log_error("Cannot bind mount file %s on directory %s.", *x, where);
986 return -EINVAL;
987 }
988 } else if (errno == ENOENT) {
989 r = mkdir_parents_label(where, 0755);
990 if (r < 0)
991 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
992 } else {
993 log_error_errno(errno, "Failed to bind mount %s: %m", *x);
994 return -errno;
995 }
996
997 /* Create the mount point. Any non-directory file can be
998 * mounted on any non-directory file (regular, fifo, socket,
999 * char, block).
1000 */
1001 if (S_ISDIR(source_st.st_mode)) {
1002 r = mkdir_label(where, 0755);
1003 if (r < 0 && errno != EEXIST)
1004 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1005 } else {
1006 r = touch(where);
1007 if (r < 0)
1008 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1009 }
1010
1011 if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
1012 return log_error_errno(errno, "mount(%s) failed: %m", where);
1013
1014 if (ro) {
1015 r = bind_remount_recursive(where, true);
1016 if (r < 0)
1017 return log_error_errno(r, "Read-Only bind mount failed: %m");
1018 }
1019 }
1020
1021 return 0;
1022 }
1023
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * controller: passed verbatim as mount data of the cgroup file system.
 * read_only:  if true, the mount is subsequently remounted read-only.
 *
 * Returns 1 if the hierarchy was mounted, 0 if the target already was a
 * mount point (nothing is done then), or a negative errno-style value. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        const unsigned long base_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *mount_point;
        int mounted;

        mount_point = strjoina(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(mount_point, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", mount_point);
        if (mounted > 0)
                return 0;

        /* Result deliberately not checked; a missing directory surfaces
         * as a mount() failure right below */
        mkdir_p(mount_point, 0755);

        /* The superblock mount options of the mount point need to be
         * identical to the hosts', and hence writable... */
        if (mount("cgroup", mount_point, "cgroup", base_flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", mount_point);

        /* ... hence let's only make the bind mount read-only, not the
         * superblock. */
        if (read_only &&
            mount(NULL, mount_point, NULL, MS_BIND|MS_REMOUNT|base_flags|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", mount_point);

        return 1;
}
1051
1052 static int mount_cgroup(const char *dest) {
1053 _cleanup_set_free_free_ Set *controllers = NULL;
1054 _cleanup_free_ char *own_cgroup_path = NULL;
1055 const char *cgroup_root, *systemd_root, *systemd_own;
1056 int r;
1057
1058 controllers = set_new(&string_hash_ops);
1059 if (!controllers)
1060 return log_oom();
1061
1062 r = cg_kernel_controllers(controllers);
1063 if (r < 0)
1064 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1065
1066 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1067 if (r < 0)
1068 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1069
1070 cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1071 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1072 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1073
1074 for (;;) {
1075 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1076
1077 controller = set_steal_first(controllers);
1078 if (!controller)
1079 break;
1080
1081 origin = strappend("/sys/fs/cgroup/", controller);
1082 if (!origin)
1083 return log_oom();
1084
1085 r = readlink_malloc(origin, &combined);
1086 if (r == -EINVAL) {
1087 /* Not a symbolic link, but directly a single cgroup hierarchy */
1088
1089 r = mount_cgroup_hierarchy(dest, controller, controller, true);
1090 if (r < 0)
1091 return r;
1092
1093 } else if (r < 0)
1094 return log_error_errno(r, "Failed to read link %s: %m", origin);
1095 else {
1096 _cleanup_free_ char *target = NULL;
1097
1098 target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1099 if (!target)
1100 return log_oom();
1101
1102 /* A symbolic link, a combination of controllers in one hierarchy */
1103
1104 if (!filename_is_valid(combined)) {
1105 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1106 continue;
1107 }
1108
1109 r = mount_cgroup_hierarchy(dest, combined, combined, true);
1110 if (r < 0)
1111 return r;
1112
1113 if (symlink(combined, target) < 0)
1114 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1115 }
1116 }
1117
1118 r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1119 if (r < 0)
1120 return r;
1121
1122 /* Make our own cgroup a (writable) bind mount */
1123 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1124 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
1125 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1126
1127 /* And then remount the systemd cgroup root read-only */
1128 systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1129 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1130 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1131
1132 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1133 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1134
1135 return 0;
1136 }
1137
1138 static int mount_tmpfs(const char *dest) {
1139 char **i, **o;
1140
1141 STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1142 _cleanup_free_ char *where = NULL;
1143 int r;
1144
1145 where = strappend(dest, *i);
1146 if (!where)
1147 return log_oom();
1148
1149 r = mkdir_label(where, 0755);
1150 if (r < 0 && r != -EEXIST)
1151 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1152
1153 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1154 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1155 }
1156
1157 return 0;
1158 }
1159
1160 static int setup_timezone(const char *dest) {
1161 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1162 char *z, *y;
1163 int r;
1164
1165 assert(dest);
1166
1167 /* Fix the timezone, if possible */
1168 r = readlink_malloc("/etc/localtime", &p);
1169 if (r < 0) {
1170 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1171 return 0;
1172 }
1173
1174 z = path_startswith(p, "../usr/share/zoneinfo/");
1175 if (!z)
1176 z = path_startswith(p, "/usr/share/zoneinfo/");
1177 if (!z) {
1178 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1179 return 0;
1180 }
1181
1182 where = strappend(dest, "/etc/localtime");
1183 if (!where)
1184 return log_oom();
1185
1186 r = readlink_malloc(where, &q);
1187 if (r >= 0) {
1188 y = path_startswith(q, "../usr/share/zoneinfo/");
1189 if (!y)
1190 y = path_startswith(q, "/usr/share/zoneinfo/");
1191
1192 /* Already pointing to the right place? Then do nothing .. */
1193 if (y && streq(y, z))
1194 return 0;
1195 }
1196
1197 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1198 if (!check)
1199 return log_oom();
1200
1201 if (access(check, F_OK) < 0) {
1202 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1203 return 0;
1204 }
1205
1206 what = strappend("../usr/share/zoneinfo/", z);
1207 if (!what)
1208 return log_oom();
1209
1210 r = mkdir_parents(where, 0755);
1211 if (r < 0) {
1212 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1213
1214 return 0;
1215 }
1216
1217 r = unlink(where);
1218 if (r < 0 && errno != ENOENT) {
1219 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1220
1221 return 0;
1222 }
1223
1224 if (symlink(what, where) < 0) {
1225 log_error_errno(errno, "Failed to correct timezone of container: %m");
1226 return 0;
1227 }
1228
1229 return 0;
1230 }
1231
1232 static int setup_resolv_conf(const char *dest) {
1233 _cleanup_free_ char *where = NULL;
1234 int r;
1235
1236 assert(dest);
1237
1238 if (arg_private_network)
1239 return 0;
1240
1241 /* Fix resolv.conf, if possible */
1242 where = strappend(dest, "/etc/resolv.conf");
1243 if (!where)
1244 return log_oom();
1245
1246 /* We don't really care for the results of this really. If it
1247 * fails, it fails, but meh... */
1248 r = mkdir_parents(where, 0755);
1249 if (r < 0) {
1250 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1251
1252 return 0;
1253 }
1254
1255 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1256 if (r < 0) {
1257 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1258
1259 return 0;
1260 }
1261
1262 return 0;
1263 }
1264
1265 static int setup_volatile_state(const char *directory) {
1266 const char *p;
1267 int r;
1268
1269 assert(directory);
1270
1271 if (arg_volatile != VOLATILE_STATE)
1272 return 0;
1273
1274 /* --volatile=state means we simply overmount /var
1275 with a tmpfs, and the rest read-only. */
1276
1277 r = bind_remount_recursive(directory, true);
1278 if (r < 0)
1279 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1280
1281 p = strjoina(directory, "/var");
1282 r = mkdir(p, 0755);
1283 if (r < 0 && errno != EEXIST)
1284 return log_error_errno(errno, "Failed to create %s: %m", directory);
1285
1286 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1287 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1288
1289 return 0;
1290 }
1291
1292 static int setup_volatile(const char *directory) {
1293 bool tmpfs_mounted = false, bind_mounted = false;
1294 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1295 const char *f, *t;
1296 int r;
1297
1298 assert(directory);
1299
1300 if (arg_volatile != VOLATILE_YES)
1301 return 0;
1302
1303 /* --volatile=yes means we mount a tmpfs to the root dir, and
1304 the original /usr to use inside it, and that read-only. */
1305
1306 if (!mkdtemp(template))
1307 return log_error_errno(errno, "Failed to create temporary directory: %m");
1308
1309 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1310 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1311 r = -errno;
1312 goto fail;
1313 }
1314
1315 tmpfs_mounted = true;
1316
1317 f = strjoina(directory, "/usr");
1318 t = strjoina(template, "/usr");
1319
1320 r = mkdir(t, 0755);
1321 if (r < 0 && errno != EEXIST) {
1322 log_error_errno(errno, "Failed to create %s: %m", t);
1323 r = -errno;
1324 goto fail;
1325 }
1326
1327 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1328 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1329 r = -errno;
1330 goto fail;
1331 }
1332
1333 bind_mounted = true;
1334
1335 r = bind_remount_recursive(t, true);
1336 if (r < 0) {
1337 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1338 goto fail;
1339 }
1340
1341 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1342 log_error_errno(errno, "Failed to move root mount: %m");
1343 r = -errno;
1344 goto fail;
1345 }
1346
1347 rmdir(template);
1348
1349 return 0;
1350
1351 fail:
1352 if (bind_mounted)
1353 umount(t);
1354 if (tmpfs_mounted)
1355 umount(template);
1356 rmdir(template);
1357 return r;
1358 }
1359
1360 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1361
1362 snprintf(s, 37,
1363 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1364 SD_ID128_FORMAT_VAL(id));
1365
1366 return s;
1367 }
1368
1369 static int setup_boot_id(const char *dest) {
1370 _cleanup_free_ char *from = NULL, *to = NULL;
1371 sd_id128_t rnd = {};
1372 char as_uuid[37];
1373 int r;
1374
1375 assert(dest);
1376
1377 if (arg_share_system)
1378 return 0;
1379
1380 /* Generate a new randomized boot ID, so that each boot-up of
1381 * the container gets a new one */
1382
1383 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1384 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1385 if (!from || !to)
1386 return log_oom();
1387
1388 r = sd_id128_randomize(&rnd);
1389 if (r < 0)
1390 return log_error_errno(r, "Failed to generate random boot id: %m");
1391
1392 id128_format_as_uuid(rnd, as_uuid);
1393
1394 r = write_string_file(from, as_uuid);
1395 if (r < 0)
1396 return log_error_errno(r, "Failed to write boot id: %m");
1397
1398 if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
1399 log_error_errno(errno, "Failed to bind mount boot id: %m");
1400 r = -errno;
1401 } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1402 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1403
1404 unlink(from);
1405 return r;
1406 }
1407
1408 static int copy_devnodes(const char *dest) {
1409
1410 static const char devnodes[] =
1411 "null\0"
1412 "zero\0"
1413 "full\0"
1414 "random\0"
1415 "urandom\0"
1416 "tty\0"
1417 "net/tun\0";
1418
1419 const char *d;
1420 int r = 0;
1421 _cleanup_umask_ mode_t u;
1422
1423 assert(dest);
1424
1425 u = umask(0000);
1426
1427 NULSTR_FOREACH(d, devnodes) {
1428 _cleanup_free_ char *from = NULL, *to = NULL;
1429 struct stat st;
1430
1431 from = strappend("/dev/", d);
1432 to = strjoin(dest, "/dev/", d, NULL);
1433 if (!from || !to)
1434 return log_oom();
1435
1436 if (stat(from, &st) < 0) {
1437
1438 if (errno != ENOENT)
1439 return log_error_errno(errno, "Failed to stat %s: %m", from);
1440
1441 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1442
1443 log_error("%s is not a char or block device, cannot copy", from);
1444 return -EIO;
1445
1446 } else {
1447 r = mkdir_parents(to, 0775);
1448 if (r < 0) {
1449 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1450 return -r;
1451 }
1452
1453 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1454 if (errno != EPERM)
1455 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1456
1457 /* Some systems abusively restrict mknod but
1458 * allow bind mounts. */
1459 r = touch(to);
1460 if (r < 0)
1461 return log_error_errno(r, "touch (%s) failed: %m", to);
1462 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1463 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1464 }
1465
1466 if (arg_userns && arg_uid_shift != UID_INVALID)
1467 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1468 return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1469 }
1470 }
1471
1472 return r;
1473 }
1474
1475 static int setup_ptmx(const char *dest) {
1476 _cleanup_free_ char *p = NULL;
1477
1478 p = strappend(dest, "/dev/ptmx");
1479 if (!p)
1480 return log_oom();
1481
1482 if (symlink("pts/ptmx", p) < 0)
1483 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1484
1485 if (arg_userns && arg_uid_shift != UID_INVALID)
1486 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1487 return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1488
1489 return 0;
1490 }
1491
1492 static int setup_dev_console(const char *dest, const char *console) {
1493 _cleanup_umask_ mode_t u;
1494 const char *to;
1495 int r;
1496
1497 assert(dest);
1498 assert(console);
1499
1500 u = umask(0000);
1501
1502 r = chmod_and_chown(console, 0600, 0, 0);
1503 if (r < 0)
1504 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1505
1506 /* We need to bind mount the right tty to /dev/console since
1507 * ptys can only exist on pts file systems. To have something
1508 * to bind mount things on we create a empty regular file. */
1509
1510 to = strjoina(dest, "/dev/console");
1511 r = touch(to);
1512 if (r < 0)
1513 return log_error_errno(r, "touch() for /dev/console failed: %m");
1514
1515 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1516 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1517
1518 return 0;
1519 }
1520
1521 static int setup_kmsg(const char *dest, int kmsg_socket) {
1522 _cleanup_free_ char *from = NULL, *to = NULL;
1523 _cleanup_umask_ mode_t u;
1524 int r, fd, k;
1525 union {
1526 struct cmsghdr cmsghdr;
1527 uint8_t buf[CMSG_SPACE(sizeof(int))];
1528 } control = {};
1529 struct msghdr mh = {
1530 .msg_control = &control,
1531 .msg_controllen = sizeof(control),
1532 };
1533 struct cmsghdr *cmsg;
1534
1535 assert(dest);
1536 assert(kmsg_socket >= 0);
1537
1538 u = umask(0000);
1539
1540 /* We create the kmsg FIFO as /dev/kmsg, but immediately
1541 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1542 * on the reading side behave very similar to /proc/kmsg,
1543 * their writing side behaves differently from /dev/kmsg in
1544 * that writing blocks when nothing is reading. In order to
1545 * avoid any problems with containers deadlocking due to this
1546 * we simply make /dev/kmsg unavailable to the container. */
1547 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1548 asprintf(&to, "%s/proc/kmsg", dest) < 0)
1549 return log_oom();
1550
1551 if (mkfifo(from, 0600) < 0)
1552 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1553
1554 r = chmod_and_chown(from, 0600, 0, 0);
1555 if (r < 0)
1556 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1557
1558 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1559 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1560
1561 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1562 if (fd < 0)
1563 return log_error_errno(errno, "Failed to open fifo: %m");
1564
1565 cmsg = CMSG_FIRSTHDR(&mh);
1566 cmsg->cmsg_level = SOL_SOCKET;
1567 cmsg->cmsg_type = SCM_RIGHTS;
1568 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1569 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1570
1571 mh.msg_controllen = cmsg->cmsg_len;
1572
1573 /* Store away the fd in the socket, so that it stays open as
1574 * long as we run the child */
1575 k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1576 safe_close(fd);
1577
1578 if (k < 0)
1579 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1580
1581 /* And now make the FIFO unavailable as /dev/kmsg... */
1582 unlink(from);
1583 return 0;
1584 }
1585
1586 static int send_rtnl(int send_fd) {
1587 union {
1588 struct cmsghdr cmsghdr;
1589 uint8_t buf[CMSG_SPACE(sizeof(int))];
1590 } control = {};
1591 struct msghdr mh = {
1592 .msg_control = &control,
1593 .msg_controllen = sizeof(control),
1594 };
1595 struct cmsghdr *cmsg;
1596 _cleanup_close_ int fd = -1;
1597 ssize_t k;
1598
1599 assert(send_fd >= 0);
1600
1601 if (!arg_expose_ports)
1602 return 0;
1603
1604 fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1605 if (fd < 0)
1606 return log_error_errno(errno, "failed to allocate container netlink: %m");
1607
1608 cmsg = CMSG_FIRSTHDR(&mh);
1609 cmsg->cmsg_level = SOL_SOCKET;
1610 cmsg->cmsg_type = SCM_RIGHTS;
1611 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1612 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1613
1614 mh.msg_controllen = cmsg->cmsg_len;
1615
1616 /* Store away the fd in the socket, so that it stays open as
1617 * long as we run the child */
1618 k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1619 if (k < 0)
1620 return log_error_errno(errno, "Failed to send netlink fd: %m");
1621
1622 return 0;
1623 }
1624
1625 static int flush_ports(union in_addr_union *exposed) {
1626 ExposePort *p;
1627 int r, af = AF_INET;
1628
1629 assert(exposed);
1630
1631 if (!arg_expose_ports)
1632 return 0;
1633
1634 if (in_addr_is_null(af, exposed))
1635 return 0;
1636
1637 log_debug("Lost IP address.");
1638
1639 LIST_FOREACH(ports, p, arg_expose_ports) {
1640 r = fw_add_local_dnat(false,
1641 af,
1642 p->protocol,
1643 NULL,
1644 NULL, 0,
1645 NULL, 0,
1646 p->host_port,
1647 exposed,
1648 p->container_port,
1649 NULL);
1650 if (r < 0)
1651 log_warning_errno(r, "Failed to modify firewall: %m");
1652 }
1653
1654 *exposed = IN_ADDR_NULL;
1655 return 0;
1656 }
1657
1658 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1659 _cleanup_free_ struct local_address *addresses = NULL;
1660 _cleanup_free_ char *pretty = NULL;
1661 union in_addr_union new_exposed;
1662 ExposePort *p;
1663 bool add;
1664 int af = AF_INET, r;
1665
1666 assert(exposed);
1667
1668 /* Invoked each time an address is added or removed inside the
1669 * container */
1670
1671 if (!arg_expose_ports)
1672 return 0;
1673
1674 r = local_addresses(rtnl, 0, af, &addresses);
1675 if (r < 0)
1676 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1677
1678 add = r > 0 &&
1679 addresses[0].family == af &&
1680 addresses[0].scope < RT_SCOPE_LINK;
1681
1682 if (!add)
1683 return flush_ports(exposed);
1684
1685 new_exposed = addresses[0].address;
1686 if (in_addr_equal(af, exposed, &new_exposed))
1687 return 0;
1688
1689 in_addr_to_string(af, &new_exposed, &pretty);
1690 log_debug("New container IP is %s.", strna(pretty));
1691
1692 LIST_FOREACH(ports, p, arg_expose_ports) {
1693
1694 r = fw_add_local_dnat(true,
1695 af,
1696 p->protocol,
1697 NULL,
1698 NULL, 0,
1699 NULL, 0,
1700 p->host_port,
1701 &new_exposed,
1702 p->container_port,
1703 in_addr_is_null(af, exposed) ? NULL : exposed);
1704 if (r < 0)
1705 log_warning_errno(r, "Failed to modify firewall: %m");
1706 }
1707
1708 *exposed = new_exposed;
1709 return 0;
1710 }
1711
1712 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1713 union in_addr_union *exposed = userdata;
1714
1715 assert(rtnl);
1716 assert(m);
1717 assert(exposed);
1718
1719 expose_ports(rtnl, exposed);
1720 return 0;
1721 }
1722
1723 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1724 union {
1725 struct cmsghdr cmsghdr;
1726 uint8_t buf[CMSG_SPACE(sizeof(int))];
1727 } control = {};
1728 struct msghdr mh = {
1729 .msg_control = &control,
1730 .msg_controllen = sizeof(control),
1731 };
1732 struct cmsghdr *cmsg;
1733 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1734 int fd, r;
1735 ssize_t k;
1736
1737 assert(event);
1738 assert(recv_fd >= 0);
1739 assert(ret);
1740
1741 if (!arg_expose_ports)
1742 return 0;
1743
1744 k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1745 if (k < 0)
1746 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1747
1748 cmsg = CMSG_FIRSTHDR(&mh);
1749 assert(cmsg->cmsg_level == SOL_SOCKET);
1750 assert(cmsg->cmsg_type == SCM_RIGHTS);
1751 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1752 memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1753
1754 r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1755 if (r < 0) {
1756 safe_close(fd);
1757 return log_error_errno(r, "Failed to create rtnl object: %m");
1758 }
1759
1760 r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1761 if (r < 0)
1762 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1763
1764 r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1765 if (r < 0)
1766 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1767
1768 r = sd_rtnl_attach_event(rtnl, event, 0);
1769 if (r < 0)
1770 return log_error_errno(r, "Failed to add to even loop: %m");
1771
1772 *ret = rtnl;
1773 rtnl = NULL;
1774
1775 return 0;
1776 }
1777
1778 static int setup_hostname(void) {
1779
1780 if (arg_share_system)
1781 return 0;
1782
1783 if (sethostname_idempotent(arg_machine) < 0)
1784 return -errno;
1785
1786 return 0;
1787 }
1788
1789 static int setup_journal(const char *directory) {
1790 sd_id128_t machine_id, this_id;
1791 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1792 char *id;
1793 int r;
1794
1795 /* Don't link journals in ephemeral mode */
1796 if (arg_ephemeral)
1797 return 0;
1798
1799 p = strappend(directory, "/etc/machine-id");
1800 if (!p)
1801 return log_oom();
1802
1803 r = read_one_line_file(p, &b);
1804 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1805 return 0;
1806 else if (r < 0)
1807 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1808
1809 id = strstrip(b);
1810 if (isempty(id) && arg_link_journal == LINK_AUTO)
1811 return 0;
1812
1813 /* Verify validity */
1814 r = sd_id128_from_string(id, &machine_id);
1815 if (r < 0)
1816 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1817
1818 r = sd_id128_get_machine(&this_id);
1819 if (r < 0)
1820 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1821
1822 if (sd_id128_equal(machine_id, this_id)) {
1823 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1824 "Host and machine ids are equal (%s): refusing to link journals", id);
1825 if (arg_link_journal == LINK_AUTO)
1826 return 0;
1827 return -EEXIST;
1828 }
1829
1830 if (arg_link_journal == LINK_NO)
1831 return 0;
1832
1833 free(p);
1834 p = strappend("/var/log/journal/", id);
1835 q = strjoin(directory, "/var/log/journal/", id, NULL);
1836 if (!p || !q)
1837 return log_oom();
1838
1839 if (path_is_mount_point(p, false) > 0) {
1840 if (arg_link_journal != LINK_AUTO) {
1841 log_error("%s: already a mount point, refusing to use for journal", p);
1842 return -EEXIST;
1843 }
1844
1845 return 0;
1846 }
1847
1848 if (path_is_mount_point(q, false) > 0) {
1849 if (arg_link_journal != LINK_AUTO) {
1850 log_error("%s: already a mount point, refusing to use for journal", q);
1851 return -EEXIST;
1852 }
1853
1854 return 0;
1855 }
1856
1857 r = readlink_and_make_absolute(p, &d);
1858 if (r >= 0) {
1859 if ((arg_link_journal == LINK_GUEST ||
1860 arg_link_journal == LINK_AUTO) &&
1861 path_equal(d, q)) {
1862
1863 r = mkdir_p(q, 0755);
1864 if (r < 0)
1865 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1866 return 0;
1867 }
1868
1869 if (unlink(p) < 0)
1870 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1871 } else if (r == -EINVAL) {
1872
1873 if (arg_link_journal == LINK_GUEST &&
1874 rmdir(p) < 0) {
1875
1876 if (errno == ENOTDIR) {
1877 log_error("%s already exists and is neither a symlink nor a directory", p);
1878 return r;
1879 } else {
1880 log_error_errno(errno, "Failed to remove %s: %m", p);
1881 return -errno;
1882 }
1883 }
1884 } else if (r != -ENOENT) {
1885 log_error_errno(errno, "readlink(%s) failed: %m", p);
1886 return r;
1887 }
1888
1889 if (arg_link_journal == LINK_GUEST) {
1890
1891 if (symlink(q, p) < 0) {
1892 if (arg_link_journal_try) {
1893 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1894 return 0;
1895 } else {
1896 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1897 return -errno;
1898 }
1899 }
1900
1901 r = mkdir_p(q, 0755);
1902 if (r < 0)
1903 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1904 return 0;
1905 }
1906
1907 if (arg_link_journal == LINK_HOST) {
1908 /* don't create parents here -- if the host doesn't have
1909 * permanent journal set up, don't force it here */
1910 r = mkdir(p, 0755);
1911 if (r < 0) {
1912 if (arg_link_journal_try) {
1913 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1914 return 0;
1915 } else {
1916 log_error_errno(errno, "Failed to create %s: %m", p);
1917 return r;
1918 }
1919 }
1920
1921 } else if (access(p, F_OK) < 0)
1922 return 0;
1923
1924 if (dir_is_empty(q) == 0)
1925 log_warning("%s is not empty, proceeding anyway.", q);
1926
1927 r = mkdir_p(q, 0755);
1928 if (r < 0) {
1929 log_error_errno(errno, "Failed to create %s: %m", q);
1930 return r;
1931 }
1932
1933 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1934 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1935
1936 return 0;
1937 }
1938
1939 static int drop_capabilities(void) {
1940 return capability_bounding_set_drop(~arg_retain, false);
1941 }
1942
1943 static int register_machine(pid_t pid, int local_ifindex) {
1944 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1945 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1946 int r;
1947
1948 if (!arg_register)
1949 return 0;
1950
1951 r = sd_bus_default_system(&bus);
1952 if (r < 0)
1953 return log_error_errno(r, "Failed to open system bus: %m");
1954
1955 if (arg_keep_unit) {
1956 r = sd_bus_call_method(
1957 bus,
1958 "org.freedesktop.machine1",
1959 "/org/freedesktop/machine1",
1960 "org.freedesktop.machine1.Manager",
1961 "RegisterMachineWithNetwork",
1962 &error,
1963 NULL,
1964 "sayssusai",
1965 arg_machine,
1966 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1967 "nspawn",
1968 "container",
1969 (uint32_t) pid,
1970 strempty(arg_directory),
1971 local_ifindex > 0 ? 1 : 0, local_ifindex);
1972 } else {
1973 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1974 char **i;
1975
1976 r = sd_bus_message_new_method_call(
1977 bus,
1978 &m,
1979 "org.freedesktop.machine1",
1980 "/org/freedesktop/machine1",
1981 "org.freedesktop.machine1.Manager",
1982 "CreateMachineWithNetwork");
1983 if (r < 0)
1984 return bus_log_create_error(r);
1985
1986 r = sd_bus_message_append(
1987 m,
1988 "sayssusai",
1989 arg_machine,
1990 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1991 "nspawn",
1992 "container",
1993 (uint32_t) pid,
1994 strempty(arg_directory),
1995 local_ifindex > 0 ? 1 : 0, local_ifindex);
1996 if (r < 0)
1997 return bus_log_create_error(r);
1998
1999 r = sd_bus_message_open_container(m, 'a', "(sv)");
2000 if (r < 0)
2001 return bus_log_create_error(r);
2002
2003 if (!isempty(arg_slice)) {
2004 r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2005 if (r < 0)
2006 return bus_log_create_error(r);
2007 }
2008
2009 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2010 if (r < 0)
2011 return bus_log_create_error(r);
2012
2013 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2014 /* Allow the container to
2015 * access and create the API
2016 * device nodes, so that
2017 * PrivateDevices= in the
2018 * container can work
2019 * fine */
2020 "/dev/null", "rwm",
2021 "/dev/zero", "rwm",
2022 "/dev/full", "rwm",
2023 "/dev/random", "rwm",
2024 "/dev/urandom", "rwm",
2025 "/dev/tty", "rwm",
2026 "/dev/net/tun", "rwm",
2027 /* Allow the container
2028 * access to ptys. However,
2029 * do not permit the
2030 * container to ever create
2031 * these device nodes. */
2032 "/dev/pts/ptmx", "rw",
2033 "char-pts", "rw");
2034 if (r < 0)
2035 return log_error_errno(r, "Failed to add device whitelist: %m");
2036
2037 STRV_FOREACH(i, arg_property) {
2038 r = sd_bus_message_open_container(m, 'r', "sv");
2039 if (r < 0)
2040 return bus_log_create_error(r);
2041
2042 r = bus_append_unit_property_assignment(m, *i);
2043 if (r < 0)
2044 return r;
2045
2046 r = sd_bus_message_close_container(m);
2047 if (r < 0)
2048 return bus_log_create_error(r);
2049 }
2050
2051 r = sd_bus_message_close_container(m);
2052 if (r < 0)
2053 return bus_log_create_error(r);
2054
2055 r = sd_bus_call(bus, m, 0, &error, NULL);
2056 }
2057
2058 if (r < 0) {
2059 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2060 return r;
2061 }
2062
2063 return 0;
2064 }
2065
2066 static int terminate_machine(pid_t pid) {
2067 _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2068 _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2069 _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2070 const char *path;
2071 int r;
2072
2073 if (!arg_register)
2074 return 0;
2075
2076 r = sd_bus_default_system(&bus);
2077 if (r < 0)
2078 return log_error_errno(r, "Failed to open system bus: %m");
2079
2080 r = sd_bus_call_method(
2081 bus,
2082 "org.freedesktop.machine1",
2083 "/org/freedesktop/machine1",
2084 "org.freedesktop.machine1.Manager",
2085 "GetMachineByPID",
2086 &error,
2087 &reply,
2088 "u",
2089 (uint32_t) pid);
2090 if (r < 0) {
2091 /* Note that the machine might already have been
2092 * cleaned up automatically, hence don't consider it a
2093 * failure if we cannot get the machine object. */
2094 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2095 return 0;
2096 }
2097
2098 r = sd_bus_message_read(reply, "o", &path);
2099 if (r < 0)
2100 return bus_log_parse_error(r);
2101
2102 r = sd_bus_call_method(
2103 bus,
2104 "org.freedesktop.machine1",
2105 path,
2106 "org.freedesktop.machine1.Machine",
2107 "Terminate",
2108 &error,
2109 NULL,
2110 NULL);
2111 if (r < 0) {
2112 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2113 return 0;
2114 }
2115
2116 return 0;
2117 }
2118
2119 static int reset_audit_loginuid(void) {
2120 _cleanup_free_ char *p = NULL;
2121 int r;
2122
2123 if (arg_share_system)
2124 return 0;
2125
2126 r = read_one_line_file("/proc/self/loginuid", &p);
2127 if (r == -ENOENT)
2128 return 0;
2129 if (r < 0)
2130 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2131
2132 /* Already reset? */
2133 if (streq(p, "4294967295"))
2134 return 0;
2135
2136 r = write_string_file("/proc/self/loginuid", "4294967295");
2137 if (r < 0) {
2138 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2139 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2140 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2141 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2142 "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2143
2144 sleep(5);
2145 }
2146
2147 return 0;
2148 }
2149
/* Fixed 128-bit keys passed to generate_mac() as hash key when deriving
 * predictable MAC addresses. setup_veth() uses HOST_HASH_KEY for the host
 * side and CONTAINER_HASH_KEY for the container side of the veth pair.
 * MACVLAN_HASH_KEY is presumably used for macvlan interfaces -- its user
 * is not visible in this part of the file, TODO confirm. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2153
2154 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2155 uint8_t result[8];
2156 size_t l, sz;
2157 uint8_t *v, *i;
2158 int r;
2159
2160 l = strlen(arg_machine);
2161 sz = sizeof(sd_id128_t) + l;
2162 if (idx > 0)
2163 sz += sizeof(idx);
2164
2165 v = alloca(sz);
2166
2167 /* fetch some persistent data unique to the host */
2168 r = sd_id128_get_machine((sd_id128_t*) v);
2169 if (r < 0)
2170 return r;
2171
2172 /* combine with some data unique (on this host) to this
2173 * container instance */
2174 i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2175 if (idx > 0) {
2176 idx = htole64(idx);
2177 memcpy(i, &idx, sizeof(idx));
2178 }
2179
2180 /* Let's hash the host machine ID plus the container name. We
2181 * use a fixed, but originally randomly created hash key here. */
2182 siphash24(result, v, sz, hash_key.bytes);
2183
2184 assert_cc(ETH_ALEN <= sizeof(result));
2185 memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2186
2187 /* see eth_random_addr in the kernel */
2188 mac->ether_addr_octet[0] &= 0xfe; /* clear multicast bit */
2189 mac->ether_addr_octet[0] |= 0x02; /* set local assignment bit (IEEE802) */
2190
2191 return 0;
2192 }
2193
2194 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2195 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2196 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2197 struct ether_addr mac_host, mac_container;
2198 int r, i;
2199
2200 if (!arg_private_network)
2201 return 0;
2202
2203 if (!arg_network_veth)
2204 return 0;
2205
2206 /* Use two different interface name prefixes depending whether
2207 * we are in bridge mode or not. */
2208 snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2209 arg_network_bridge ? "vb" : "ve", arg_machine);
2210
2211 r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2212 if (r < 0)
2213 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2214
2215 r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2216 if (r < 0)
2217 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2218
2219 r = sd_rtnl_open(&rtnl, 0);
2220 if (r < 0)
2221 return log_error_errno(r, "Failed to connect to netlink: %m");
2222
2223 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2224 if (r < 0)
2225 return log_error_errno(r, "Failed to allocate netlink message: %m");
2226
2227 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2228 if (r < 0)
2229 return log_error_errno(r, "Failed to add netlink interface name: %m");
2230
2231 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2232 if (r < 0)
2233 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2234
2235 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2236 if (r < 0)
2237 return log_error_errno(r, "Failed to open netlink container: %m");
2238
2239 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2240 if (r < 0)
2241 return log_error_errno(r, "Failed to open netlink container: %m");
2242
2243 r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2244 if (r < 0)
2245 return log_error_errno(r, "Failed to open netlink container: %m");
2246
2247 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2248 if (r < 0)
2249 return log_error_errno(r, "Failed to add netlink interface name: %m");
2250
2251 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2252 if (r < 0)
2253 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2254
2255 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2256 if (r < 0)
2257 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2258
2259 r = sd_rtnl_message_close_container(m);
2260 if (r < 0)
2261 return log_error_errno(r, "Failed to close netlink container: %m");
2262
2263 r = sd_rtnl_message_close_container(m);
2264 if (r < 0)
2265 return log_error_errno(r, "Failed to close netlink container: %m");
2266
2267 r = sd_rtnl_message_close_container(m);
2268 if (r < 0)
2269 return log_error_errno(r, "Failed to close netlink container: %m");
2270
2271 r = sd_rtnl_call(rtnl, m, 0, NULL);
2272 if (r < 0)
2273 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2274
2275 i = (int) if_nametoindex(iface_name);
2276 if (i <= 0)
2277 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2278
2279 *ifi = i;
2280
2281 return 0;
2282 }
2283
2284 static int setup_bridge(const char veth_name[], int *ifi) {
2285 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2286 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2287 int r, bridge;
2288
2289 if (!arg_private_network)
2290 return 0;
2291
2292 if (!arg_network_veth)
2293 return 0;
2294
2295 if (!arg_network_bridge)
2296 return 0;
2297
2298 bridge = (int) if_nametoindex(arg_network_bridge);
2299 if (bridge <= 0)
2300 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2301
2302 *ifi = bridge;
2303
2304 r = sd_rtnl_open(&rtnl, 0);
2305 if (r < 0)
2306 return log_error_errno(r, "Failed to connect to netlink: %m");
2307
2308 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2309 if (r < 0)
2310 return log_error_errno(r, "Failed to allocate netlink message: %m");
2311
2312 r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2313 if (r < 0)
2314 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2315
2316 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2319
2320 r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2321 if (r < 0)
2322 return log_error_errno(r, "Failed to add netlink master field: %m");
2323
2324 r = sd_rtnl_call(rtnl, m, 0, NULL);
2325 if (r < 0)
2326 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2327
2328 return 0;
2329 }
2330
2331 static int parse_interface(struct udev *udev, const char *name) {
2332 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2333 char ifi_str[2 + DECIMAL_STR_MAX(int)];
2334 int ifi;
2335
2336 ifi = (int) if_nametoindex(name);
2337 if (ifi <= 0)
2338 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2339
2340 sprintf(ifi_str, "n%i", ifi);
2341 d = udev_device_new_from_device_id(udev, ifi_str);
2342 if (!d)
2343 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2344
2345 if (udev_device_get_is_initialized(d) <= 0) {
2346 log_error("Network interface %s is not initialized yet.", name);
2347 return -EBUSY;
2348 }
2349
2350 return ifi;
2351 }
2352
2353 static int move_network_interfaces(pid_t pid) {
2354 _cleanup_udev_unref_ struct udev *udev = NULL;
2355 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2356 char **i;
2357 int r;
2358
2359 if (!arg_private_network)
2360 return 0;
2361
2362 if (strv_isempty(arg_network_interfaces))
2363 return 0;
2364
2365 r = sd_rtnl_open(&rtnl, 0);
2366 if (r < 0)
2367 return log_error_errno(r, "Failed to connect to netlink: %m");
2368
2369 udev = udev_new();
2370 if (!udev) {
2371 log_error("Failed to connect to udev.");
2372 return -ENOMEM;
2373 }
2374
2375 STRV_FOREACH(i, arg_network_interfaces) {
2376 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2377 int ifi;
2378
2379 ifi = parse_interface(udev, *i);
2380 if (ifi < 0)
2381 return ifi;
2382
2383 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2384 if (r < 0)
2385 return log_error_errno(r, "Failed to allocate netlink message: %m");
2386
2387 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2388 if (r < 0)
2389 return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2390
2391 r = sd_rtnl_call(rtnl, m, 0, NULL);
2392 if (r < 0)
2393 return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2394 }
2395
2396 return 0;
2397 }
2398
2399 static int setup_macvlan(pid_t pid) {
2400 _cleanup_udev_unref_ struct udev *udev = NULL;
2401 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2402 unsigned idx = 0;
2403 char **i;
2404 int r;
2405
2406 if (!arg_private_network)
2407 return 0;
2408
2409 if (strv_isempty(arg_network_macvlan))
2410 return 0;
2411
2412 r = sd_rtnl_open(&rtnl, 0);
2413 if (r < 0)
2414 return log_error_errno(r, "Failed to connect to netlink: %m");
2415
2416 udev = udev_new();
2417 if (!udev) {
2418 log_error("Failed to connect to udev.");
2419 return -ENOMEM;
2420 }
2421
2422 STRV_FOREACH(i, arg_network_macvlan) {
2423 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2424 _cleanup_free_ char *n = NULL;
2425 struct ether_addr mac;
2426 int ifi;
2427
2428 ifi = parse_interface(udev, *i);
2429 if (ifi < 0)
2430 return ifi;
2431
2432 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2433 if (r < 0)
2434 return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2435
2436 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2437 if (r < 0)
2438 return log_error_errno(r, "Failed to allocate netlink message: %m");
2439
2440 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2441 if (r < 0)
2442 return log_error_errno(r, "Failed to add netlink interface index: %m");
2443
2444 n = strappend("mv-", *i);
2445 if (!n)
2446 return log_oom();
2447
2448 strshorten(n, IFNAMSIZ-1);
2449
2450 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2451 if (r < 0)
2452 return log_error_errno(r, "Failed to add netlink interface name: %m");
2453
2454 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2455 if (r < 0)
2456 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2457
2458 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2459 if (r < 0)
2460 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2461
2462 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2463 if (r < 0)
2464 return log_error_errno(r, "Failed to open netlink container: %m");
2465
2466 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2467 if (r < 0)
2468 return log_error_errno(r, "Failed to open netlink container: %m");
2469
2470 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2471 if (r < 0)
2472 return log_error_errno(r, "Failed to append macvlan mode: %m");
2473
2474 r = sd_rtnl_message_close_container(m);
2475 if (r < 0)
2476 return log_error_errno(r, "Failed to close netlink container: %m");
2477
2478 r = sd_rtnl_message_close_container(m);
2479 if (r < 0)
2480 return log_error_errno(r, "Failed to close netlink container: %m");
2481
2482 r = sd_rtnl_call(rtnl, m, 0, NULL);
2483 if (r < 0)
2484 return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2485 }
2486
2487 return 0;
2488 }
2489
2490 static int setup_ipvlan(pid_t pid) {
2491 _cleanup_udev_unref_ struct udev *udev = NULL;
2492 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2493 char **i;
2494 int r;
2495
2496 if (!arg_private_network)
2497 return 0;
2498
2499 if (strv_isempty(arg_network_ipvlan))
2500 return 0;
2501
2502 r = sd_rtnl_open(&rtnl, 0);
2503 if (r < 0)
2504 return log_error_errno(r, "Failed to connect to netlink: %m");
2505
2506 udev = udev_new();
2507 if (!udev) {
2508 log_error("Failed to connect to udev.");
2509 return -ENOMEM;
2510 }
2511
2512 STRV_FOREACH(i, arg_network_ipvlan) {
2513 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2514 _cleanup_free_ char *n = NULL;
2515 int ifi;
2516
2517 ifi = parse_interface(udev, *i);
2518 if (ifi < 0)
2519 return ifi;
2520
2521 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2522 if (r < 0)
2523 return log_error_errno(r, "Failed to allocate netlink message: %m");
2524
2525 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2526 if (r < 0)
2527 return log_error_errno(r, "Failed to add netlink interface index: %m");
2528
2529 n = strappend("iv-", *i);
2530 if (!n)
2531 return log_oom();
2532
2533 strshorten(n, IFNAMSIZ-1);
2534
2535 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2536 if (r < 0)
2537 return log_error_errno(r, "Failed to add netlink interface name: %m");
2538
2539 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2540 if (r < 0)
2541 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2542
2543 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2544 if (r < 0)
2545 return log_error_errno(r, "Failed to open netlink container: %m");
2546
2547 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2548 if (r < 0)
2549 return log_error_errno(r, "Failed to open netlink container: %m");
2550
2551 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2552 if (r < 0)
2553 return log_error_errno(r, "Failed to add ipvlan mode: %m");
2554
2555 r = sd_rtnl_message_close_container(m);
2556 if (r < 0)
2557 return log_error_errno(r, "Failed to close netlink container: %m");
2558
2559 r = sd_rtnl_message_close_container(m);
2560 if (r < 0)
2561 return log_error_errno(r, "Failed to close netlink container: %m");
2562
2563 r = sd_rtnl_call(rtnl, m, 0, NULL);
2564 if (r < 0)
2565 return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2566 }
2567
2568 return 0;
2569 }
2570
/* Builds and loads a seccomp filter with a default action of "allow" that
 *
 *   - makes each syscall in the table below fail with EPERM, unless the
 *     capability listed next to it is set in arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 when built without HAVE_SECCOMP. Otherwise returns the result of
 * the last libseccomp call made: >= 0 on success, negative on the first
 * failure (already logged). The filter context is released on every path. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Syscalls that are only useful to holders of the paired capability */
        static const struct {
                uint64_t cap;
                int nr;
        } gated[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int ret;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error_errno(ret, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(gated); k++) {

                /* The capability is retained, hence leave the syscall usable */
                if (arg_retain & (1ULL << gated[k].cap))
                        continue;

                ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), gated[k].nr, 0);

                /* -EFAULT means the syscall is unknown; that is not an error for us */
                if (ret < 0 && ret != -EFAULT) {
                        log_error_errno(ret, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /* Audit does not work in containers, and much of the audit userspace
         * hookup fails when run inside one. We do not care and simply turn off
         * the creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that audit
         * is disabled in the kernel. */
        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error_errno(ret, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        /* Clear the filter attribute SCMP_FLTATR_CTL_NNP before loading */
        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error_errno(ret, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error_errno(ret, "Failed to install seccomp audit filter: %m");

out:
        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
2657
2658 static int setup_propagate(const char *root) {
2659 const char *p, *q;
2660
2661 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2662 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2663 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2664 (void) mkdir_p(p, 0600);
2665
2666 q = strjoina(root, "/run/systemd/nspawn/incoming");
2667 mkdir_parents(q, 0755);
2668 mkdir_p(q, 0600);
2669
2670 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2671 return log_error_errno(errno, "Failed to install propagation bind mount.");
2672
2673 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2674 return log_error_errno(errno, "Failed to make propagation mount read-only");
2675
2676 return 0;
2677 }
2678
2679 static int setup_image(char **device_path, int *loop_nr) {
2680 struct loop_info64 info = {
2681 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2682 };
2683 _cleanup_close_ int fd = -1, control = -1, loop = -1;
2684 _cleanup_free_ char* loopdev = NULL;
2685 struct stat st;
2686 int r, nr;
2687
2688 assert(device_path);
2689 assert(loop_nr);
2690 assert(arg_image);
2691
2692 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2693 if (fd < 0)
2694 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2695
2696 if (fstat(fd, &st) < 0)
2697 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2698
2699 if (S_ISBLK(st.st_mode)) {
2700 char *p;
2701
2702 p = strdup(arg_image);
2703 if (!p)
2704 return log_oom();
2705
2706 *device_path = p;
2707
2708 *loop_nr = -1;
2709
2710 r = fd;
2711 fd = -1;
2712
2713 return r;
2714 }
2715
2716 if (!S_ISREG(st.st_mode)) {
2717 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2718 return -EINVAL;
2719 }
2720
2721 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2722 if (control < 0)
2723 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2724
2725 nr = ioctl(control, LOOP_CTL_GET_FREE);
2726 if (nr < 0)
2727 return log_error_errno(errno, "Failed to allocate loop device: %m");
2728
2729 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2730 return log_oom();
2731
2732 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2733 if (loop < 0)
2734 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2735
2736 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2737 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2738
2739 if (arg_read_only)
2740 info.lo_flags |= LO_FLAGS_READ_ONLY;
2741
2742 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2743 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2744
2745 *device_path = loopdev;
2746 loopdev = NULL;
2747
2748 *loop_nr = nr;
2749
2750 r = loop;
2751 loop = -1;
2752
2753 return r;
2754 }
2755
/* Explanatory text appended to the error messages of dissect_image() */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Inspects the partition table of the block device open as fd and picks the
 * partitions to use as root, /home and /srv.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. For the home, srv,
 * native root and secondary root types the matching partition with the lowest
 * partition number wins. Partitions of the generic Linux type are only used
 * as root if no typed root partition exists, and only if there is exactly one.
 *
 * MBR: only partitions of type 0x83 whose flags equal 0x80 (bootable) are
 * considered; they are treated like generic partitions, i.e. exactly one must
 * exist.
 *
 * Outputs (device node paths are newly allocated and owned by the caller):
 *   *root_device, *root_device_rw  always set on success
 *   *secondary                     true iff the secondary-architecture root
 *                                  partition was chosen
 *   *home_device, *home_device_rw  only written if a home partition was found
 *   *srv_device, *srv_device_rw    only written if a srv partition was found
 *
 * Returns 0 on success, a negative errno-style value on failure (already
 * logged); -EOPNOTSUPP when built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has registered as many partition devices as
         * blkid found in the partition table. Give up after 10 attempts. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to hold one more entry than that (hence "m + 1"). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the partition with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate as generic partition, so
                                 * that a second bootable Linux partition is
                                 * detected above and rejected below, instead of
                                 * silently replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Hand over ownership of the chosen strings: clearing the local
         * pointer keeps the cleanup handler from freeing them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3135
/* Probes the file system type of the block device what with libblkid and
 * mounts it on where, or on where + directory if directory is non-NULL.
 *
 * The mount is done with MS_NODEV, and additionally MS_RDONLY unless rw is
 * true and arg_read_only is unset. LUKS volumes are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure (already
 * logged); -EOPNOTSUPP when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A return value
                 * of -1 is a genuine probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3199
3200 static int mount_devices(
3201 const char *where,
3202 const char *root_device, bool root_device_rw,
3203 const char *home_device, bool home_device_rw,
3204 const char *srv_device, bool srv_device_rw) {
3205 int r;
3206
3207 assert(where);
3208
3209 if (root_device) {
3210 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3211 if (r < 0)
3212 return log_error_errno(r, "Failed to mount root directory: %m");
3213 }
3214
3215 if (home_device) {
3216 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3217 if (r < 0)
3218 return log_error_errno(r, "Failed to mount home directory: %m");
3219 }
3220
3221 if (srv_device) {
3222 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3223 if (r < 0)
3224 return log_error_errno(r, "Failed to mount server data directory: %m");
3225 }
3226
3227 return 0;
3228 }
3229
3230 static void loop_remove(int nr, int *image_fd) {
3231 _cleanup_close_ int control = -1;
3232 int r;
3233
3234 if (nr < 0)
3235 return;
3236
3237 if (image_fd && *image_fd >= 0) {
3238 r = ioctl(*image_fd, LOOP_CLR_FD);
3239 if (r < 0)
3240 log_debug_errno(errno, "Failed to close loop image: %m");
3241 *image_fd = safe_close(*image_fd);
3242 }
3243
3244 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3245 if (control < 0) {
3246 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3247 return;
3248 }
3249
3250 r = ioctl(control, LOOP_CTL_REMOVE, nr);
3251 if (r < 0)
3252 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3253 }
3254
3255 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3256 int pipe_fds[2];
3257 pid_t pid;
3258
3259 assert(database);
3260 assert(key);
3261 assert(rpid);
3262
3263 if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3264 return log_error_errno(errno, "Failed to allocate pipe: %m");
3265
3266 pid = fork();
3267 if (pid < 0)
3268 return log_error_errno(errno, "Failed to fork getent child: %m");
3269 else if (pid == 0) {
3270 int nullfd;
3271 char *empty_env = NULL;
3272
3273 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3274 _exit(EXIT_FAILURE);
3275
3276 if (pipe_fds[0] > 2)
3277 safe_close(pipe_fds[0]);
3278 if (pipe_fds[1] > 2)
3279 safe_close(pipe_fds[1]);
3280
3281 nullfd = open("/dev/null", O_RDWR);
3282 if (nullfd < 0)
3283 _exit(EXIT_FAILURE);
3284
3285 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3286 _exit(EXIT_FAILURE);
3287
3288 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3289 _exit(EXIT_FAILURE);
3290
3291 if (nullfd > 2)
3292 safe_close(nullfd);
3293
3294 reset_all_signal_handlers();
3295 close_all_fds(NULL, 0);
3296
3297 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3298 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3299 _exit(EXIT_FAILURE);
3300 }
3301
3302 pipe_fds[1] = safe_close(pipe_fds[1]);
3303
3304 *rpid = pid;
3305
3306 return pipe_fds[0];
3307 }
3308
3309 static int change_uid_gid(char **_home) {
3310 char line[LINE_MAX], *x, *u, *g, *h;
3311 const char *word, *state;
3312 _cleanup_free_ uid_t *uids = NULL;
3313 _cleanup_free_ char *home = NULL;
3314 _cleanup_fclose_ FILE *f = NULL;
3315 _cleanup_close_ int fd = -1;
3316 unsigned n_uids = 0;
3317 size_t sz = 0, l;
3318 uid_t uid;
3319 gid_t gid;
3320 pid_t pid;
3321 int r;
3322
3323 assert(_home);
3324
3325 if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3326 /* Reset everything fully to 0, just in case */
3327
3328 if (setgroups(0, NULL) < 0)
3329 return log_error_errno(errno, "setgroups() failed: %m");
3330
3331 if (setresgid(0, 0, 0) < 0)
3332 return log_error_errno(errno, "setregid() failed: %m");
3333
3334 if (setresuid(0, 0, 0) < 0)
3335 return log_error_errno(errno, "setreuid() failed: %m");
3336
3337 *_home = NULL;
3338 return 0;
3339 }
3340
3341 /* First, get user credentials */
3342 fd = spawn_getent("passwd", arg_user, &pid);
3343 if (fd < 0)
3344 return fd;
3345
3346 f = fdopen(fd, "r");
3347 if (!f)
3348 return log_oom();
3349 fd = -1;
3350
3351 if (!fgets(line, sizeof(line), f)) {
3352
3353 if (!ferror(f)) {
3354 log_error("Failed to resolve user %s.", arg_user);
3355 return -ESRCH;
3356 }
3357
3358 log_error_errno(errno, "Failed to read from getent: %m");
3359 return -errno;
3360 }
3361
3362 truncate_nl(line);
3363
3364 wait_for_terminate_and_warn("getent passwd", pid, true);
3365
3366 x = strchr(line, ':');
3367 if (!x) {
3368 log_error("/etc/passwd entry has invalid user field.");
3369 return -EIO;
3370 }
3371
3372 u = strchr(x+1, ':');
3373 if (!u) {
3374 log_error("/etc/passwd entry has invalid password field.");
3375 return -EIO;
3376 }
3377
3378 u++;
3379 g = strchr(u, ':');
3380 if (!g) {
3381 log_error("/etc/passwd entry has invalid UID field.");
3382 return -EIO;
3383 }
3384
3385 *g = 0;
3386 g++;
3387 x = strchr(g, ':');
3388 if (!x) {
3389 log_error("/etc/passwd entry has invalid GID field.");
3390 return -EIO;
3391 }
3392
3393 *x = 0;
3394 h = strchr(x+1, ':');
3395 if (!h) {
3396 log_error("/etc/passwd entry has invalid GECOS field.");
3397 return -EIO;
3398 }
3399
3400 h++;
3401 x = strchr(h, ':');
3402 if (!x) {
3403 log_error("/etc/passwd entry has invalid home directory field.");
3404 return -EIO;
3405 }
3406
3407 *x = 0;
3408
3409 r = parse_uid(u, &uid);
3410 if (r < 0) {
3411 log_error("Failed to parse UID of user.");
3412 return -EIO;
3413 }
3414
3415 r = parse_gid(g, &gid);
3416 if (r < 0) {
3417 log_error("Failed to parse GID of user.");
3418 return -EIO;
3419 }
3420
3421 home = strdup(h);
3422 if (!home)
3423 return log_oom();
3424
3425 /* Second, get group memberships */
3426 fd = spawn_getent("initgroups", arg_user, &pid);
3427 if (fd < 0)
3428 return fd;
3429
3430 fclose(f);
3431 f = fdopen(fd, "r");
3432 if (!f)
3433 return log_oom();
3434 fd = -1;
3435
3436 if (!fgets(line, sizeof(line), f)) {
3437 if (!ferror(f)) {
3438 log_error("Failed to resolve user %s.", arg_user);
3439 return -ESRCH;
3440 }
3441
3442 log_error_errno(errno, "Failed to read from getent: %m");
3443 return -errno;
3444 }
3445
3446 truncate_nl(line);
3447
3448 wait_for_terminate_and_warn("getent initgroups", pid, true);
3449
3450 /* Skip over the username and subsequent separator whitespace */
3451 x = line;
3452 x += strcspn(x, WHITESPACE);
3453 x += strspn(x, WHITESPACE);
3454
3455 FOREACH_WORD(word, l, x, state) {
3456 char c[l+1];
3457
3458 memcpy(c, word, l);
3459 c[l] = 0;
3460
3461 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3462 return log_oom();
3463
3464 r = parse_uid(c, &uids[n_uids++]);
3465 if (r < 0) {
3466 log_error("Failed to parse group data from getent.");
3467 return -EIO;
3468 }
3469 }
3470
3471 r = mkdir_parents(home, 0775);
3472 if (r < 0)
3473 return log_error_errno(r, "Failed to make home root directory: %m");
3474
3475 r = mkdir_safe(home, 0755, uid, gid);
3476 if (r < 0 && r != -EEXIST)
3477 return log_error_errno(r, "Failed to make home directory: %m");
3478
3479 fchown(STDIN_FILENO, uid, gid);
3480 fchown(STDOUT_FILENO, uid, gid);
3481 fchown(STDERR_FILENO, uid, gid);
3482
3483 if (setgroups(n_uids, uids) < 0)
3484 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3485
3486 if (setresgid(gid, gid, gid) < 0)
3487 return log_error_errno(errno, "setregid() failed: %m");
3488
3489 if (setresuid(uid, uid, uid) < 0)
3490 return log_error_errno(errno, "setreuid() failed: %m");
3491
3492 if (_home) {
3493 *_home = home;
3494 home = NULL;
3495 }
3496
3497 return 0;
3498 }
3499
3500 /*
3501 * Return values:
3502 * < 0 : wait_for_terminate() failed to get the state of the
3503 * container, the container was terminated by a signal, or
3504 * failed for an unknown reason. No change is made to the
3505 * container argument.
3506 * > 0 : The program executed in the container terminated with an
3507 * error. The exit code of the program executed in the
3508 * container is returned. The container argument has been set
3509 * to CONTAINER_TERMINATED.
3510 * 0 : The container is being rebooted, has been shut down or exited
3511 * successfully. The container argument has been set to either
3512 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3513 *
3514 * That is, success is indicated by a return value of zero, and an
3515 * error is indicated by a non-zero value.
3516 */
3517 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3518 siginfo_t status;
3519 int r;
3520
3521 r = wait_for_terminate(pid, &status);
3522 if (r < 0)
3523 return log_warning_errno(r, "Failed to wait for container: %m");
3524
3525 switch (status.si_code) {
3526
3527 case CLD_EXITED:
3528 if (status.si_status == 0) {
3529 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3530
3531 } else
3532 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3533
3534 *container = CONTAINER_TERMINATED;
3535 return status.si_status;
3536
3537 case CLD_KILLED:
3538 if (status.si_status == SIGINT) {
3539
3540 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3541 *container = CONTAINER_TERMINATED;
3542 return 0;
3543
3544 } else if (status.si_status == SIGHUP) {
3545
3546 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3547 *container = CONTAINER_REBOOTED;
3548 return 0;
3549 }
3550
3551 /* CLD_KILLED fallthrough */
3552
3553 case CLD_DUMPED:
3554 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3555 return -EIO;
3556
3557 default:
3558 log_error("Container %s failed due to unknown reason.", arg_machine);
3559 return -EIO;
3560 }
3561
3562 return r;
3563 }
3564
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD so that the signal interrupts the parent's blocking
 * calls instead of being ignored. */
static void nop_handler(int sig) {
}
3566
3567 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3568 pid_t pid;
3569
3570 pid = PTR_TO_UINT32(userdata);
3571 if (pid > 0) {
3572 if (kill(pid, arg_kill_signal) >= 0) {
3573 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3574 sd_event_source_set_userdata(s, NULL);
3575 return 0;
3576 }
3577 }
3578
3579 sd_event_exit(sd_event_source_get_event(s), 0);
3580 return 0;
3581 }
3582
3583 static int determine_names(void) {
3584 int r;
3585
3586 if (!arg_image && !arg_directory) {
3587 if (arg_machine) {
3588 _cleanup_(image_unrefp) Image *i = NULL;
3589
3590 r = image_find(arg_machine, &i);
3591 if (r < 0)
3592 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3593 else if (r == 0) {
3594 log_error("No image for machine '%s': %m", arg_machine);
3595 return -ENOENT;
3596 }
3597
3598 if (i->type == IMAGE_RAW)
3599 r = set_sanitized_path(&arg_image, i->path);
3600 else
3601 r = set_sanitized_path(&arg_directory, i->path);
3602 if (r < 0)
3603 return log_error_errno(r, "Invalid image directory: %m");
3604
3605 arg_read_only = arg_read_only || i->read_only;
3606 } else
3607 arg_directory = get_current_dir_name();
3608
3609 if (!arg_directory && !arg_machine) {
3610 log_error("Failed to determine path, please use -D or -i.");
3611 return -EINVAL;
3612 }
3613 }
3614
3615 if (!arg_machine) {
3616 if (arg_directory && path_equal(arg_directory, "/"))
3617 arg_machine = gethostname_malloc();
3618 else
3619 arg_machine = strdup(basename(arg_image ?: arg_directory));
3620
3621 if (!arg_machine)
3622 return log_oom();
3623
3624 hostname_cleanup(arg_machine, false);
3625 if (!machine_name_is_valid(arg_machine)) {
3626 log_error("Failed to determine machine name automatically, please use -M.");
3627 return -EINVAL;
3628 }
3629
3630 if (arg_ephemeral) {
3631 char *b;
3632
3633 /* Add a random suffix when this is an
3634 * ephemeral machine, so that we can run many
3635 * instances at once without manually having
3636 * to specify -M each time. */
3637
3638 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3639 return log_oom();
3640
3641 free(arg_machine);
3642 arg_machine = b;
3643 }
3644 }
3645
3646 return 0;
3647 }
3648
3649 static int determine_uid_shift(void) {
3650 int r;
3651
3652 if (!arg_userns)
3653 return 0;
3654
3655 if (arg_uid_shift == UID_INVALID) {
3656 struct stat st;
3657
3658 r = stat(arg_directory, &st);
3659 if (r < 0)
3660 return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3661
3662 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3663
3664 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3665 log_error("UID and GID base of %s don't match.", arg_directory);
3666 return -EINVAL;
3667 }
3668
3669 arg_uid_range = UINT32_C(0x10000);
3670 }
3671
3672 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3673 log_error("UID base too high for UID range.");
3674 return -EINVAL;
3675 }
3676
3677 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3678 return 0;
3679 }
3680
3681 int main(int argc, char *argv[]) {
3682
3683 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3684 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3685 _cleanup_close_ int master = -1, image_fd = -1;
3686 _cleanup_fdset_free_ FDSet *fds = NULL;
3687 int r, n_fd_passed, loop_nr = -1;
3688 char veth_name[IFNAMSIZ];
3689 bool secondary = false, remove_subvol = false;
3690 sigset_t mask, mask_chld;
3691 pid_t pid = 0;
3692 int ret = EXIT_SUCCESS;
3693 union in_addr_union exposed = {};
3694 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3695 bool interactive;
3696
3697 log_parse_environment();
3698 log_open();
3699
3700 r = parse_argv(argc, argv);
3701 if (r <= 0)
3702 goto finish;
3703
3704 r = determine_names();
3705 if (r < 0)
3706 goto finish;
3707
3708 if (geteuid() != 0) {
3709 log_error("Need to be root.");
3710 r = -EPERM;
3711 goto finish;
3712 }
3713
3714 log_close();
3715 n_fd_passed = sd_listen_fds(false);
3716 if (n_fd_passed > 0) {
3717 r = fdset_new_listen_fds(&fds, false);
3718 if (r < 0) {
3719 log_error_errno(r, "Failed to collect file descriptors: %m");
3720 goto finish;
3721 }
3722 }
3723 fdset_close_others(fds);
3724 log_open();
3725
3726 if (arg_directory) {
3727 assert(!arg_image);
3728
3729 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3730 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3731 r = -EINVAL;
3732 goto finish;
3733 }
3734
3735 if (arg_ephemeral) {
3736 _cleanup_free_ char *np = NULL;
3737
3738 /* If the specified path is a mount point we
3739 * generate the new snapshot immediately
3740 * inside it under a random name. However if
3741 * the specified is not a mount point we
3742 * create the new snapshot in the parent
3743 * directory, just next to it. */
3744 r = path_is_mount_point(arg_directory, false);
3745 if (r < 0) {
3746 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3747 goto finish;
3748 }
3749 if (r > 0)
3750 r = tempfn_random_child(arg_directory, &np);
3751 else
3752 r = tempfn_random(arg_directory, &np);
3753 if (r < 0) {
3754 log_error_errno(r, "Failed to generate name for snapshot: %m");
3755 goto finish;
3756 }
3757
3758 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3759 if (r < 0) {
3760 log_error_errno(r, "Failed to lock %s: %m", np);
3761 goto finish;
3762 }
3763
3764 r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3765 if (r < 0) {
3766 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3767 goto finish;
3768 }
3769
3770 free(arg_directory);
3771 arg_directory = np;
3772 np = NULL;
3773
3774 remove_subvol = true;
3775
3776 } else {
3777 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3778 if (r == -EBUSY) {
3779 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3780 goto finish;
3781 }
3782 if (r < 0) {
3783 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3784 return r;
3785 }
3786
3787 if (arg_template) {
3788 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3789 if (r == -EEXIST) {
3790 if (!arg_quiet)
3791 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3792 } else if (r < 0) {
3793 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3794 goto finish;
3795 } else {
3796 if (!arg_quiet)
3797 log_info("Populated %s from template %s.", arg_directory, arg_template);
3798 }
3799 }
3800 }
3801
3802 if (arg_boot) {
3803 if (path_is_os_tree(arg_directory) <= 0) {
3804 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3805 r = -EINVAL;
3806 goto finish;
3807 }
3808 } else {
3809 const char *p;
3810
3811 p = strjoina(arg_directory,
3812 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3813 if (access(p, F_OK) < 0) {
3814 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3815 r = -EINVAL;
3816 goto finish;
3817 }
3818 }
3819
3820 } else {
3821 char template[] = "/tmp/nspawn-root-XXXXXX";
3822
3823 assert(arg_image);
3824 assert(!arg_template);
3825
3826 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3827 if (r == -EBUSY) {
3828 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3829 goto finish;
3830 }
3831 if (r < 0) {
3832 r = log_error_errno(r, "Failed to create image lock: %m");
3833 goto finish;
3834 }
3835
3836 if (!mkdtemp(template)) {
3837 log_error_errno(errno, "Failed to create temporary directory: %m");
3838 r = -errno;
3839 goto finish;
3840 }
3841
3842 arg_directory = strdup(template);
3843 if (!arg_directory) {
3844 r = log_oom();
3845 goto finish;
3846 }
3847
3848 image_fd = setup_image(&device_path, &loop_nr);
3849 if (image_fd < 0) {
3850 r = image_fd;
3851 goto finish;
3852 }
3853
3854 r = dissect_image(image_fd,
3855 &root_device, &root_device_rw,
3856 &home_device, &home_device_rw,
3857 &srv_device, &srv_device_rw,
3858 &secondary);
3859 if (r < 0)
3860 goto finish;
3861 }
3862
3863 r = determine_uid_shift();
3864 if (r < 0)
3865 goto finish;
3866
3867 interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3868
3869 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3870 if (master < 0) {
3871 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3872 goto finish;
3873 }
3874
3875 r = ptsname_malloc(master, &console);
3876 if (r < 0) {
3877 r = log_error_errno(r, "Failed to determine tty name: %m");
3878 goto finish;
3879 }
3880
3881 if (unlockpt(master) < 0) {
3882 r = log_error_errno(errno, "Failed to unlock tty: %m");
3883 goto finish;
3884 }
3885
3886 if (!arg_quiet)
3887 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3888 arg_machine, arg_image ?: arg_directory);
3889
3890 assert_se(sigemptyset(&mask) == 0);
3891 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3892 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3893
3894 assert_se(sigemptyset(&mask_chld) == 0);
3895 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3896
3897 for (;;) {
3898 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3899 ContainerStatus container_status;
3900 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3901 struct sigaction sa = {
3902 .sa_handler = nop_handler,
3903 .sa_flags = SA_NOCLDSTOP,
3904 };
3905
3906 r = barrier_create(&barrier);
3907 if (r < 0) {
3908 log_error_errno(r, "Cannot initialize IPC barrier: %m");
3909 goto finish;
3910 }
3911
3912 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3913 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3914 goto finish;
3915 }
3916
3917 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3918 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3919 goto finish;
3920 }
3921
3922 /* Child can be killed before execv(), so handle SIGCHLD
3923 * in order to interrupt parent's blocking calls and
3924 * give it a chance to call wait() and terminate. */
3925 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3926 if (r < 0) {
3927 r = log_error_errno(errno, "Failed to change the signal mask: %m");
3928 goto finish;
3929 }
3930
3931 r = sigaction(SIGCHLD, &sa, NULL);
3932 if (r < 0) {
3933 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3934 goto finish;
3935 }
3936
3937 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3938 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3939 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3940 if (pid < 0) {
3941 if (errno == EINVAL)
3942 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3943 else
3944 r = log_error_errno(errno, "clone() failed: %m");
3945
3946 goto finish;
3947 }
3948
3949 if (pid == 0) {
3950 /* child */
3951 _cleanup_free_ char *home = NULL;
3952 unsigned n_env = 2;
3953 const char *envp[] = {
3954 "PATH=" DEFAULT_PATH_SPLIT_USR,
3955 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3956 NULL, /* TERM */
3957 NULL, /* HOME */
3958 NULL, /* USER */
3959 NULL, /* LOGNAME */
3960 NULL, /* container_uuid */
3961 NULL, /* LISTEN_FDS */
3962 NULL, /* LISTEN_PID */
3963 NULL
3964 };
3965 char **env_use;
3966
3967 barrier_set_role(&barrier, BARRIER_CHILD);
3968
3969 envp[n_env] = strv_find_prefix(environ, "TERM=");
3970 if (envp[n_env])
3971 n_env ++;
3972
3973 master = safe_close(master);
3974
3975 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3976 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3977
3978 reset_all_signal_handlers();
3979 reset_signal_mask();
3980
3981 if (interactive) {
3982 close_nointr(STDIN_FILENO);
3983 close_nointr(STDOUT_FILENO);
3984 close_nointr(STDERR_FILENO);
3985
3986 r = open_terminal(console, O_RDWR);
3987 if (r != STDIN_FILENO) {
3988 if (r >= 0) {
3989 safe_close(r);
3990 r = -EINVAL;
3991 }
3992
3993 log_error_errno(r, "Failed to open console: %m");
3994 _exit(EXIT_FAILURE);
3995 }
3996
3997 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3998 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3999 log_error_errno(errno, "Failed to duplicate console: %m");
4000 _exit(EXIT_FAILURE);
4001 }
4002 }
4003
4004 if (setsid() < 0) {
4005 log_error_errno(errno, "setsid() failed: %m");
4006 _exit(EXIT_FAILURE);
4007 }
4008
4009 if (reset_audit_loginuid() < 0)
4010 _exit(EXIT_FAILURE);
4011
4012 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4013 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4014 _exit(EXIT_FAILURE);
4015 }
4016
4017 if (arg_private_network)
4018 loopback_setup();
4019
4020 /* Mark everything as slave, so that we still
4021 * receive mounts from the real root, but don't
4022 * propagate mounts to the real root. */
4023 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4024 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4025 _exit(EXIT_FAILURE);
4026 }
4027
4028 if (mount_devices(arg_directory,
4029 root_device, root_device_rw,
4030 home_device, home_device_rw,
4031 srv_device, srv_device_rw) < 0)
4032 _exit(EXIT_FAILURE);
4033
4034 /* Turn directory into bind mount */
4035 if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
4036 log_error_errno(errno, "Failed to make bind mount: %m");
4037 _exit(EXIT_FAILURE);
4038 }
4039
4040 r = setup_volatile(arg_directory);
4041 if (r < 0)
4042 _exit(EXIT_FAILURE);
4043
4044 if (setup_volatile_state(arg_directory) < 0)
4045 _exit(EXIT_FAILURE);
4046
4047 r = base_filesystem_create(arg_directory);
4048 if (r < 0)
4049 _exit(EXIT_FAILURE);
4050
4051 if (arg_read_only) {
4052 r = bind_remount_recursive(arg_directory, true);
4053 if (r < 0) {
4054 log_error_errno(r, "Failed to make tree read-only: %m");
4055 _exit(EXIT_FAILURE);
4056 }
4057 }
4058
4059 if (mount_all(arg_directory) < 0)
4060 _exit(EXIT_FAILURE);
4061
4062 if (copy_devnodes(arg_directory) < 0)
4063 _exit(EXIT_FAILURE);
4064
4065 if (setup_ptmx(arg_directory) < 0)
4066 _exit(EXIT_FAILURE);
4067
4068 dev_setup(arg_directory);
4069
4070 if (setup_propagate(arg_directory) < 0)
4071 _exit(EXIT_FAILURE);
4072
4073 if (setup_seccomp() < 0)
4074 _exit(EXIT_FAILURE);
4075
4076 if (setup_dev_console(arg_directory, console) < 0)
4077 _exit(EXIT_FAILURE);
4078
4079 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4080 _exit(EXIT_FAILURE);
4081 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4082
4083 if (send_rtnl(rtnl_socket_pair[1]) < 0)
4084 _exit(EXIT_FAILURE);
4085 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4086
4087 /* Tell the parent that we are ready, and that
4088 * it can cgroupify us to that we lack access
4089 * to certain devices and resources. */
4090 (void) barrier_place(&barrier); /* #1 */
4091
4092 if (setup_boot_id(arg_directory) < 0)
4093 _exit(EXIT_FAILURE);
4094
4095 if (setup_timezone(arg_directory) < 0)
4096 _exit(EXIT_FAILURE);
4097
4098 if (setup_resolv_conf(arg_directory) < 0)
4099 _exit(EXIT_FAILURE);
4100
4101 if (setup_journal(arg_directory) < 0)
4102 _exit(EXIT_FAILURE);
4103
4104 if (mount_binds(arg_directory, arg_bind, false) < 0)
4105 _exit(EXIT_FAILURE);
4106
4107 if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4108 _exit(EXIT_FAILURE);
4109
4110 if (mount_tmpfs(arg_directory) < 0)
4111 _exit(EXIT_FAILURE);
4112
4113 /* Wait until we are cgroup-ified, so that we
4114 * can mount the right cgroup path writable */
4115 (void) barrier_place_and_sync(&barrier); /* #2 */
4116
4117 if (mount_cgroup(arg_directory) < 0)
4118 _exit(EXIT_FAILURE);
4119
4120 if (chdir(arg_directory) < 0) {
4121 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4122 _exit(EXIT_FAILURE);
4123 }
4124
4125 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4126 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4127 _exit(EXIT_FAILURE);
4128 }
4129
4130 if (chroot(".") < 0) {
4131 log_error_errno(errno, "chroot() failed: %m");
4132 _exit(EXIT_FAILURE);
4133 }
4134
4135 if (chdir("/") < 0) {
4136 log_error_errno(errno, "chdir() failed: %m");
4137 _exit(EXIT_FAILURE);
4138 }
4139
4140 if (arg_userns) {
4141 if (unshare(CLONE_NEWUSER) < 0) {
4142 log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4143 _exit(EXIT_FAILURE);
4144 }
4145
4146 /* Tell the parent, that it now can
4147 * write the UID map. */
4148 (void) barrier_place(&barrier); /* #3 */
4149
4150 /* Wait until the parent wrote the UID
4151 * map */
4152 (void) barrier_place_and_sync(&barrier); /* #4 */
4153 }
4154
4155 umask(0022);
4156
4157 if (drop_capabilities() < 0) {
4158 log_error_errno(errno, "drop_capabilities() failed: %m");
4159 _exit(EXIT_FAILURE);
4160 }
4161
4162 setup_hostname();
4163
4164 if (arg_personality != 0xffffffffLU) {
4165 if (personality(arg_personality) < 0) {
4166 log_error_errno(errno, "personality() failed: %m");
4167 _exit(EXIT_FAILURE);
4168 }
4169 } else if (secondary) {
4170 if (personality(PER_LINUX32) < 0) {
4171 log_error_errno(errno, "personality() failed: %m");
4172 _exit(EXIT_FAILURE);
4173 }
4174 }
4175
4176 #ifdef HAVE_SELINUX
4177 if (arg_selinux_context)
4178 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4179 log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4180 _exit(EXIT_FAILURE);
4181 }
4182 #endif
4183
4184 r = change_uid_gid(&home);
4185 if (r < 0)
4186 _exit(EXIT_FAILURE);
4187
4188 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4189 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4190 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4191 log_oom();
4192 _exit(EXIT_FAILURE);
4193 }
4194
4195 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4196 char as_uuid[37];
4197
4198 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4199 log_oom();
4200 _exit(EXIT_FAILURE);
4201 }
4202 }
4203
4204 if (fdset_size(fds) > 0) {
4205 r = fdset_cloexec(fds, false);
4206 if (r < 0) {
4207 log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4208 _exit(EXIT_FAILURE);
4209 }
4210
4211 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4212 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4213 log_oom();
4214 _exit(EXIT_FAILURE);
4215 }
4216 }
4217
4218 if (!strv_isempty(arg_setenv)) {
4219 char **n;
4220
4221 n = strv_env_merge(2, envp, arg_setenv);
4222 if (!n) {
4223 log_oom();
4224 _exit(EXIT_FAILURE);
4225 }
4226
4227 env_use = n;
4228 } else
4229 env_use = (char**) envp;
4230
4231 /* Let the parent know that we are ready and
4232 * wait until the parent is ready with the
4233 * setup, too... */
4234 (void) barrier_place_and_sync(&barrier); /* #5 */
4235
4236 if (arg_boot) {
4237 char **a;
4238 size_t l;
4239
4240 /* Automatically search for the init system */
4241
4242 l = 1 + argc - optind;
4243 a = newa(char*, l + 1);
4244 memcpy(a + 1, argv + optind, l * sizeof(char*));
4245
4246 a[0] = (char*) "/usr/lib/systemd/systemd";
4247 execve(a[0], a, env_use);
4248
4249 a[0] = (char*) "/lib/systemd/systemd";
4250 execve(a[0], a, env_use);
4251
4252 a[0] = (char*) "/sbin/init";
4253 execve(a[0], a, env_use);
4254 } else if (argc > optind)
4255 execvpe(argv[optind], argv + optind, env_use);
4256 else {
4257 chdir(home ? home : "/root");
4258 execle("/bin/bash", "-bash", NULL, env_use);
4259 execle("/bin/sh", "-sh", NULL, env_use);
4260 }
4261
4262 log_error_errno(errno, "execv() failed: %m");
4263 _exit(EXIT_FAILURE);
4264 }
4265
4266 barrier_set_role(&barrier, BARRIER_PARENT);
4267 fdset_free(fds);
4268 fds = NULL;
4269
4270 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4271 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4272
4273 (void) barrier_place(&barrier); /* #1 */
4274
4275 /* Wait for the most basic Child-setup to be done,
4276 * before we add hardware to it, and place it in a
4277 * cgroup. */
4278 if (barrier_sync(&barrier)) { /* #1 */
4279 int ifi = 0;
4280
4281 r = move_network_interfaces(pid);
4282 if (r < 0)
4283 goto finish;
4284
4285 r = setup_veth(pid, veth_name, &ifi);
4286 if (r < 0)
4287 goto finish;
4288
4289 r = setup_bridge(veth_name, &ifi);
4290 if (r < 0)
4291 goto finish;
4292
4293 r = setup_macvlan(pid);
4294 if (r < 0)
4295 goto finish;
4296
4297 r = setup_ipvlan(pid);
4298 if (r < 0)
4299 goto finish;
4300
4301 r = register_machine(pid, ifi);
4302 if (r < 0)
4303 goto finish;
4304
4305 /* Notify the child that the parent is ready with all
4306 * its setup, and that the child can now hand over
4307 * control to the code to run inside the container. */
4308 (void) barrier_place(&barrier); /* #2 */
4309
4310 if (arg_userns) {
4311 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4312
4313 (void) barrier_place_and_sync(&barrier); /* #3 */
4314
4315 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4316 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4317 r = write_string_file(uid_map, line);
4318 if (r < 0) {
4319 log_error_errno(r, "Failed to write UID map: %m");
4320 goto finish;
4321 }
4322
4323 /* We always assign the same UID and GID ranges */
4324 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4325 r = write_string_file(uid_map, line);
4326 if (r < 0) {
4327 log_error_errno(r, "Failed to write GID map: %m");
4328 goto finish;
4329 }
4330
4331 (void) barrier_place(&barrier); /* #4 */
4332 }
4333
4334 /* Block SIGCHLD here, before notifying child.
4335 * process_pty() will handle it with the other signals. */
4336 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4337 if (r < 0)
4338 goto finish;
4339
4340 /* Reset signal to default */
4341 r = default_signals(SIGCHLD, -1);
4342 if (r < 0)
4343 goto finish;
4344
4345 /* Let the child know that we are ready and wait that the child is completely ready now. */
4346 if (barrier_place_and_sync(&barrier)) { /* #5 */
4347 _cleanup_event_unref_ sd_event *event = NULL;
4348 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4349 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4350 char last_char = 0;
4351
4352 sd_notifyf(false,
4353 "READY=1\n"
4354 "STATUS=Container running.\n"
4355 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4356
4357 r = sd_event_new(&event);
4358 if (r < 0) {
4359 log_error_errno(r, "Failed to get default event source: %m");
4360 goto finish;
4361 }
4362
4363 if (arg_kill_signal > 0) {
4364 /* Try to kill the init system on SIGINT or SIGTERM */
4365 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4366 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4367 } else {
4368 /* Immediately exit */
4369 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4370 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4371 }
4372
4373 /* simply exit on sigchld */
4374 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4375
4376 if (arg_expose_ports) {
4377 r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4378 if (r < 0)
4379 goto finish;
4380
4381 (void) expose_ports(rtnl, &exposed);
4382 }
4383
4384 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4385
4386 r = pty_forward_new(event, master, true, !interactive, &forward);
4387 if (r < 0) {
4388 log_error_errno(r, "Failed to create PTY forwarder: %m");
4389 goto finish;
4390 }
4391
4392 r = sd_event_loop(event);
4393 if (r < 0) {
4394 log_error_errno(r, "Failed to run event loop: %m");
4395 goto finish;
4396 }
4397
4398 pty_forward_get_last_char(forward, &last_char);
4399
4400 forward = pty_forward_free(forward);
4401
4402 if (!arg_quiet && last_char != '\n')
4403 putc('\n', stdout);
4404
4405 /* Kill if it is not dead yet anyway */
4406 terminate_machine(pid);
4407 }
4408 }
4409
4410 /* Normally redundant, but better safe than sorry */
4411 kill(pid, SIGKILL);
4412
4413 r = wait_for_container(pid, &container_status);
4414 pid = 0;
4415
4416 if (r < 0)
4417 /* We failed to wait for the container, or the
4418 * container exited abnormally */
4419 goto finish;
4420 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4421 /* The container exited with a non-zero
4422 * status, or with zero status and no reboot
4423 * was requested. */
4424 ret = r;
4425 break;
4426 }
4427
4428 /* CONTAINER_REBOOTED, loop again */
4429
4430 if (arg_keep_unit) {
4431 /* Special handling if we are running as a
4432 * service: instead of simply restarting the
4433 * machine we want to restart the entire
4434 * service, so let's inform systemd about this
4435 * with the special exit code 133. The service
4436 * file uses RestartForceExitStatus=133 so
4437 * that this results in a full nspawn
4438 * restart. This is necessary since we might
4439 * have cgroup parameters set we want to have
4440 * flushed out. */
4441 ret = 133;
4442 r = 0;
4443 break;
4444 }
4445
4446 flush_ports(&exposed);
4447 }
4448
4449 finish:
4450 sd_notify(false,
4451 "STOPPING=1\n"
4452 "STATUS=Terminating...");
4453
4454 loop_remove(loop_nr, &image_fd);
4455
4456 if (pid > 0)
4457 kill(pid, SIGKILL);
4458
4459 if (remove_subvol && arg_directory) {
4460 int k;
4461
4462 k = btrfs_subvol_remove(arg_directory);
4463 if (k < 0)
4464 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4465 }
4466
4467 if (arg_machine) {
4468 const char *p;
4469
4470 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4471 (void) rm_rf(p, REMOVE_ROOT);
4472 }
4473
4474 free(arg_directory);
4475 free(arg_template);
4476 free(arg_image);
4477 free(arg_machine);
4478 free(arg_user);
4479 strv_free(arg_setenv);
4480 strv_free(arg_network_interfaces);
4481 strv_free(arg_network_macvlan);
4482 strv_free(arg_network_ipvlan);
4483 strv_free(arg_bind);
4484 strv_free(arg_bind_ro);
4485 strv_free(arg_tmpfs);
4486
4487 flush_ports(&exposed);
4488
4489 while (arg_expose_ports) {
4490 ExposePort *p = arg_expose_ports;
4491 LIST_REMOVE(ports, arg_expose_ports, p);
4492 free(p);
4493 }
4494
4495 return r < 0 ? EXIT_FAILURE : ret;
4496 }