1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
82 #include "siphash24.h"
84 #include "base-filesystem.h"
86 #include "event-util.h"
87 #include "capability.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
102 #include "seccomp-util.h"
105 typedef struct ExposePort
{
108 uint16_t container_port
;
109 LIST_FIELDS(struct ExposePort
, ports
);
112 typedef enum ContainerStatus
{
113 CONTAINER_TERMINATED
,
117 typedef enum LinkJournal
{
124 typedef enum Volatile
{
130 typedef enum CustomMountType
{
133 CUSTOM_MOUNT_OVERLAY
,
136 typedef struct CustomMount
{
137 CustomMountType type
;
139 char *source
; /* for overlayfs this is the upper directory */
146 static char *arg_directory
= NULL
;
147 static char *arg_template
= NULL
;
148 static char *arg_user
= NULL
;
149 static sd_id128_t arg_uuid
= {};
150 static char *arg_machine
= NULL
;
151 static const char *arg_selinux_context
= NULL
;
152 static const char *arg_selinux_apifs_context
= NULL
;
153 static const char *arg_slice
= NULL
;
154 static bool arg_private_network
= false;
155 static bool arg_read_only
= false;
156 static bool arg_boot
= false;
157 static bool arg_ephemeral
= false;
158 static LinkJournal arg_link_journal
= LINK_AUTO
;
159 static bool arg_link_journal_try
= false;
160 static uint64_t arg_retain
=
161 (1ULL << CAP_CHOWN
) |
162 (1ULL << CAP_DAC_OVERRIDE
) |
163 (1ULL << CAP_DAC_READ_SEARCH
) |
164 (1ULL << CAP_FOWNER
) |
165 (1ULL << CAP_FSETID
) |
166 (1ULL << CAP_IPC_OWNER
) |
168 (1ULL << CAP_LEASE
) |
169 (1ULL << CAP_LINUX_IMMUTABLE
) |
170 (1ULL << CAP_NET_BIND_SERVICE
) |
171 (1ULL << CAP_NET_BROADCAST
) |
172 (1ULL << CAP_NET_RAW
) |
173 (1ULL << CAP_SETGID
) |
174 (1ULL << CAP_SETFCAP
) |
175 (1ULL << CAP_SETPCAP
) |
176 (1ULL << CAP_SETUID
) |
177 (1ULL << CAP_SYS_ADMIN
) |
178 (1ULL << CAP_SYS_CHROOT
) |
179 (1ULL << CAP_SYS_NICE
) |
180 (1ULL << CAP_SYS_PTRACE
) |
181 (1ULL << CAP_SYS_TTY_CONFIG
) |
182 (1ULL << CAP_SYS_RESOURCE
) |
183 (1ULL << CAP_SYS_BOOT
) |
184 (1ULL << CAP_AUDIT_WRITE
) |
185 (1ULL << CAP_AUDIT_CONTROL
) |
187 static CustomMount
*arg_custom_mounts
= NULL
;
188 static unsigned arg_n_custom_mounts
= 0;
189 static char **arg_setenv
= NULL
;
190 static bool arg_quiet
= false;
191 static bool arg_share_system
= false;
192 static bool arg_register
= true;
193 static bool arg_keep_unit
= false;
194 static char **arg_network_interfaces
= NULL
;
195 static char **arg_network_macvlan
= NULL
;
196 static char **arg_network_ipvlan
= NULL
;
197 static bool arg_network_veth
= false;
198 static const char *arg_network_bridge
= NULL
;
199 static unsigned long arg_personality
= PERSONALITY_INVALID
;
200 static char *arg_image
= NULL
;
201 static Volatile arg_volatile
= VOLATILE_NO
;
202 static ExposePort
*arg_expose_ports
= NULL
;
203 static char **arg_property
= NULL
;
204 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
205 static bool arg_userns
= false;
206 static int arg_kill_signal
= 0;
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name
);
278 static CustomMount
* custom_mount_add(CustomMountType t
) {
279 CustomMount
*c
, *ret
;
281 c
= realloc(arg_custom_mounts
, (arg_n_custom_mounts
+ 1) * sizeof(CustomMount
));
285 arg_custom_mounts
= c
;
286 ret
= arg_custom_mounts
+ arg_n_custom_mounts
;
287 arg_n_custom_mounts
++;
289 *ret
= (CustomMount
) { .type
= t
};
294 static void custom_mount_free_all(void) {
297 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
298 CustomMount
*m
= &arg_custom_mounts
[i
];
301 free(m
->destination
);
305 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
312 free(arg_custom_mounts
);
313 arg_custom_mounts
= NULL
;
314 arg_n_custom_mounts
= 0;
317 static int custom_mount_compare(const void *a
, const void *b
) {
318 const CustomMount
*x
= a
, *y
= b
;
321 r
= path_compare(x
->destination
, y
->destination
);
325 if (x
->type
< y
->type
)
327 if (x
->type
> y
->type
)
333 static int custom_mounts_prepare(void) {
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
342 CustomMount
*m
= &arg_custom_mounts
[i
];
344 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
345 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
349 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
358 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
360 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
366 static int set_sanitized_path(char **b
, const char *path
) {
372 p
= canonicalize_file_name(path
);
377 p
= path_make_absolute_cwd(path
);
383 *b
= path_kill_slashes(p
);
387 static int parse_argv(int argc
, char *argv
[]) {
406 ARG_NETWORK_INTERFACE
,
418 static const struct option options
[] = {
419 { "help", no_argument
, NULL
, 'h' },
420 { "version", no_argument
, NULL
, ARG_VERSION
},
421 { "directory", required_argument
, NULL
, 'D' },
422 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
423 { "ephemeral", no_argument
, NULL
, 'x' },
424 { "user", required_argument
, NULL
, 'u' },
425 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
426 { "boot", no_argument
, NULL
, 'b' },
427 { "uuid", required_argument
, NULL
, ARG_UUID
},
428 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
429 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
430 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
431 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
432 { "bind", required_argument
, NULL
, ARG_BIND
},
433 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
434 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
435 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
436 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
437 { "machine", required_argument
, NULL
, 'M' },
438 { "slice", required_argument
, NULL
, 'S' },
439 { "setenv", required_argument
, NULL
, ARG_SETENV
},
440 { "selinux-context", required_argument
, NULL
, 'Z' },
441 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
442 { "quiet", no_argument
, NULL
, 'q' },
443 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
444 { "register", required_argument
, NULL
, ARG_REGISTER
},
445 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
446 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
447 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
448 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
449 { "network-veth", no_argument
, NULL
, 'n' },
450 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
451 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
452 { "image", required_argument
, NULL
, 'i' },
453 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
454 { "port", required_argument
, NULL
, 'p' },
455 { "property", required_argument
, NULL
, ARG_PROPERTY
},
456 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
457 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
462 uint64_t plus
= 0, minus
= 0;
467 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
476 puts(PACKAGE_STRING
);
477 puts(SYSTEMD_FEATURES
);
481 r
= set_sanitized_path(&arg_directory
, optarg
);
483 return log_error_errno(r
, "Invalid root directory: %m");
488 r
= set_sanitized_path(&arg_template
, optarg
);
490 return log_error_errno(r
, "Invalid template directory: %m");
495 r
= set_sanitized_path(&arg_image
, optarg
);
497 return log_error_errno(r
, "Invalid image path: %m");
502 arg_ephemeral
= true;
506 r
= free_and_strdup(&arg_user
, optarg
);
512 case ARG_NETWORK_BRIDGE
:
513 arg_network_bridge
= optarg
;
518 arg_network_veth
= true;
519 arg_private_network
= true;
522 case ARG_NETWORK_INTERFACE
:
523 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
526 arg_private_network
= true;
529 case ARG_NETWORK_MACVLAN
:
530 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
533 arg_private_network
= true;
536 case ARG_NETWORK_IPVLAN
:
537 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
542 case ARG_PRIVATE_NETWORK
:
543 arg_private_network
= true;
551 r
= sd_id128_from_string(optarg
, &arg_uuid
);
553 log_error("Invalid UUID: %s", optarg
);
563 if (isempty(optarg
)) {
567 if (!machine_name_is_valid(optarg
)) {
568 log_error("Invalid machine name: %s", optarg
);
572 r
= free_and_strdup(&arg_machine
, optarg
);
580 arg_selinux_context
= optarg
;
584 arg_selinux_apifs_context
= optarg
;
588 arg_read_only
= true;
592 case ARG_DROP_CAPABILITY
: {
593 const char *state
, *word
;
596 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
597 _cleanup_free_
char *t
;
599 t
= strndup(word
, length
);
603 if (streq(t
, "all")) {
604 if (c
== ARG_CAPABILITY
)
605 plus
= (uint64_t) -1;
607 minus
= (uint64_t) -1;
611 cap
= capability_from_name(t
);
613 log_error("Failed to parse capability %s.", t
);
617 if (c
== ARG_CAPABILITY
)
618 plus
|= 1ULL << (uint64_t) cap
;
620 minus
|= 1ULL << (uint64_t) cap
;
628 arg_link_journal
= LINK_GUEST
;
629 arg_link_journal_try
= true;
632 case ARG_LINK_JOURNAL
:
633 if (streq(optarg
, "auto")) {
634 arg_link_journal
= LINK_AUTO
;
635 arg_link_journal_try
= false;
636 } else if (streq(optarg
, "no")) {
637 arg_link_journal
= LINK_NO
;
638 arg_link_journal_try
= false;
639 } else if (streq(optarg
, "guest")) {
640 arg_link_journal
= LINK_GUEST
;
641 arg_link_journal_try
= false;
642 } else if (streq(optarg
, "host")) {
643 arg_link_journal
= LINK_HOST
;
644 arg_link_journal_try
= false;
645 } else if (streq(optarg
, "try-guest")) {
646 arg_link_journal
= LINK_GUEST
;
647 arg_link_journal_try
= true;
648 } else if (streq(optarg
, "try-host")) {
649 arg_link_journal
= LINK_HOST
;
650 arg_link_journal_try
= true;
652 log_error("Failed to parse link journal mode %s", optarg
);
660 _cleanup_free_
char *source
= NULL
, *destination
= NULL
;
664 e
= strchr(optarg
, ':');
666 source
= strndup(optarg
, e
- optarg
);
667 destination
= strdup(e
+ 1);
669 source
= strdup(optarg
);
670 destination
= strdup(optarg
);
673 if (!source
|| !destination
)
676 if (!path_is_absolute(source
) || !path_is_absolute(destination
)) {
677 log_error("Invalid bind mount specification: %s", optarg
);
681 m
= custom_mount_add(CUSTOM_MOUNT_BIND
);
686 m
->destination
= destination
;
687 m
->read_only
= c
== ARG_BIND_RO
;
689 source
= destination
= NULL
;
695 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
699 e
= strchr(optarg
, ':');
701 path
= strndup(optarg
, e
- optarg
);
702 opts
= strdup(e
+ 1);
704 path
= strdup(optarg
);
705 opts
= strdup("mode=0755");
711 if (!path_is_absolute(path
)) {
712 log_error("Invalid tmpfs specification: %s", optarg
);
716 m
= custom_mount_add(CUSTOM_MOUNT_TMPFS
);
720 m
->destination
= path
;
729 case ARG_OVERLAY_RO
: {
730 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
731 _cleanup_strv_free_
char **lower
= NULL
;
736 lower
= strv_split(optarg
, ":");
740 STRV_FOREACH(i
, lower
) {
741 if (!path_is_absolute(*i
)) {
742 log_error("Overlay path %s is not absolute.", *i
);
750 log_error("--overlay= needs at least two colon-separated directories specified.");
755 /* If two parameters are specified,
756 * the first one is the lower, the
757 * second one the upper directory. And
758 * we'll also define the destination
759 * mount point the same as the upper. */
763 destination
= strdup(upper
);
768 upper
= lower
[n
- 2];
769 destination
= lower
[n
- 1];
773 m
= custom_mount_add(CUSTOM_MOUNT_OVERLAY
);
777 m
->destination
= destination
;
780 m
->read_only
= c
== ARG_OVERLAY_RO
;
782 upper
= destination
= NULL
;
791 if (!env_assignment_is_valid(optarg
)) {
792 log_error("Environment variable assignment '%s' is not valid.", optarg
);
796 n
= strv_env_set(arg_setenv
, optarg
);
800 strv_free(arg_setenv
);
809 case ARG_SHARE_SYSTEM
:
810 arg_share_system
= true;
814 r
= parse_boolean(optarg
);
816 log_error("Failed to parse --register= argument: %s", optarg
);
824 arg_keep_unit
= true;
827 case ARG_PERSONALITY
:
829 arg_personality
= personality_from_string(optarg
);
830 if (arg_personality
== PERSONALITY_INVALID
) {
831 log_error("Unknown or unsupported personality '%s'.", optarg
);
840 arg_volatile
= VOLATILE_YES
;
842 r
= parse_boolean(optarg
);
844 if (streq(optarg
, "state"))
845 arg_volatile
= VOLATILE_STATE
;
847 log_error("Failed to parse --volatile= argument: %s", optarg
);
851 arg_volatile
= r
? VOLATILE_YES
: VOLATILE_NO
;
857 const char *split
, *e
;
858 uint16_t container_port
, host_port
;
862 if ((e
= startswith(optarg
, "tcp:")))
863 protocol
= IPPROTO_TCP
;
864 else if ((e
= startswith(optarg
, "udp:")))
865 protocol
= IPPROTO_UDP
;
868 protocol
= IPPROTO_TCP
;
871 split
= strchr(e
, ':');
873 char v
[split
- e
+ 1];
875 memcpy(v
, e
, split
- e
);
878 r
= safe_atou16(v
, &host_port
);
879 if (r
< 0 || host_port
<= 0) {
880 log_error("Failed to parse host port: %s", optarg
);
884 r
= safe_atou16(split
+ 1, &container_port
);
886 r
= safe_atou16(e
, &container_port
);
887 host_port
= container_port
;
890 if (r
< 0 || container_port
<= 0) {
891 log_error("Failed to parse host port: %s", optarg
);
895 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
896 if (p
->protocol
== protocol
&& p
->host_port
== host_port
) {
897 log_error("Duplicate port specification: %s", optarg
);
902 p
= new(ExposePort
, 1);
906 p
->protocol
= protocol
;
907 p
->host_port
= host_port
;
908 p
->container_port
= container_port
;
910 LIST_PREPEND(ports
, arg_expose_ports
, p
);
916 if (strv_extend(&arg_property
, optarg
) < 0)
921 case ARG_PRIVATE_USERS
:
923 _cleanup_free_
char *buffer
= NULL
;
924 const char *range
, *shift
;
926 range
= strchr(optarg
, ':');
928 buffer
= strndup(optarg
, range
- optarg
);
934 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
935 log_error("Failed to parse UID range: %s", range
);
941 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
942 log_error("Failed to parse UID: %s", optarg
);
950 case ARG_KILL_SIGNAL
:
951 arg_kill_signal
= signal_from_string_try_harder(optarg
);
952 if (arg_kill_signal
< 0) {
953 log_error("Cannot parse signal: %s", optarg
);
963 assert_not_reached("Unhandled option");
966 if (arg_share_system
)
967 arg_register
= false;
969 if (arg_boot
&& arg_share_system
) {
970 log_error("--boot and --share-system may not be combined.");
974 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
975 log_error("--keep-unit may not be used when invoked from a user session.");
979 if (arg_directory
&& arg_image
) {
980 log_error("--directory= and --image= may not be combined.");
984 if (arg_template
&& arg_image
) {
985 log_error("--template= and --image= may not be combined.");
989 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
990 log_error("--template= needs --directory= or --machine=.");
994 if (arg_ephemeral
&& arg_template
) {
995 log_error("--ephemeral and --template= may not be combined.");
999 if (arg_ephemeral
&& arg_image
) {
1000 log_error("--ephemeral and --image= may not be combined.");
1004 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1005 log_error("--ephemeral and --link-journal= may not be combined.");
1009 if (arg_volatile
!= VOLATILE_NO
&& arg_read_only
) {
1010 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1014 if (arg_expose_ports
&& !arg_private_network
) {
1015 log_error("Cannot use --port= without private networking.");
1019 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
1020 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
1022 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1024 if (arg_boot
&& arg_kill_signal
<= 0)
1025 arg_kill_signal
= SIGRTMIN
+3;
1030 static int tmpfs_patch_options(const char *options
, char **ret
) {
1033 if (arg_userns
&& arg_uid_shift
!= 0) {
1034 assert(arg_uid_shift
!= UID_INVALID
);
1037 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, arg_uid_shift
, arg_uid_shift
);
1039 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, arg_uid_shift
, arg_uid_shift
);
1047 if (arg_selinux_apifs_context
) {
1051 t
= strjoin(options
, ",context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1053 t
= strjoin("context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1068 static int mount_all(const char *dest
, bool userns
) {
1070 typedef struct MountPoint
{
1074 const char *options
;
1075 unsigned long flags
;
1080 static const MountPoint mount_table
[] = {
1081 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
1082 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
1083 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
1084 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
1085 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, true, false },
1086 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
1087 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1088 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1089 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
1091 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
1092 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
1099 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
1100 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
1103 if (userns
!= mount_table
[k
].userns
)
1106 where
= prefix_root(dest
, mount_table
[k
].where
);
1110 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
1111 if (r
< 0 && r
!= -ENOENT
)
1112 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
1114 /* Skip this entry if it is not a remount. */
1115 if (mount_table
[k
].what
&& r
> 0)
1118 r
= mkdir_p(where
, 0755);
1120 if (mount_table
[k
].fatal
)
1121 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
1123 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
1127 o
= mount_table
[k
].options
;
1128 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
1129 r
= tmpfs_patch_options(o
, &options
);
1136 if (mount(mount_table
[k
].what
,
1138 mount_table
[k
].type
,
1139 mount_table
[k
].flags
,
1142 if (mount_table
[k
].fatal
)
1143 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1145 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
1152 static int mount_bind(const char *dest
, CustomMount
*m
) {
1153 struct stat source_st
, dest_st
;
1159 if (stat(m
->source
, &source_st
) < 0)
1160 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
1162 where
= prefix_roota(dest
, m
->destination
);
1164 if (stat(where
, &dest_st
) >= 0) {
1165 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
1166 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
1170 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
1171 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
1175 } else if (errno
== ENOENT
) {
1176 r
= mkdir_parents_label(where
, 0755);
1178 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
1180 log_error_errno(errno
, "Failed to stat %s: %m", where
);
1184 /* Create the mount point. Any non-directory file can be
1185 * mounted on any non-directory file (regular, fifo, socket,
1188 if (S_ISDIR(source_st
.st_mode
))
1189 r
= mkdir_label(where
, 0755);
1192 if (r
< 0 && r
!= -EEXIST
)
1193 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
1195 if (mount(m
->source
, where
, NULL
, MS_BIND
, NULL
) < 0)
1196 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1199 r
= bind_remount_recursive(where
, true);
1201 return log_error_errno(r
, "Read-only bind mount failed: %m");
1207 static int mount_tmpfs(const char *dest
, CustomMount
*m
) {
1208 const char *where
, *options
;
1209 _cleanup_free_
char *buf
= NULL
;
1215 where
= prefix_roota(dest
, m
->destination
);
1217 r
= mkdir_p_label(where
, 0755);
1218 if (r
< 0 && r
!= -EEXIST
)
1219 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
1221 r
= tmpfs_patch_options(m
->options
, &buf
);
1224 options
= r
> 0 ? buf
: m
->options
;
1226 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
1227 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
1232 static int mount_overlay(const char *dest
, CustomMount
*m
) {
1233 _cleanup_free_
char *lower
= NULL
;
1234 const char *where
, *options
;
1240 where
= prefix_roota(dest
, m
->destination
);
1242 r
= mkdir_label(where
, 0755);
1243 if (r
< 0 && r
!= -EEXIST
)
1244 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
1246 (void) mkdir_p_label(m
->source
, 0755);
1248 strv_reverse(m
->lower
);
1249 lower
= strv_join(m
->lower
, ":");
1250 strv_reverse(m
->lower
);
1255 options
= strjoina("lowerdir=", m
->source
, ":", lower
);
1257 assert(m
->work_dir
);
1258 (void) mkdir_label(m
->work_dir
, 0700);
1260 options
= strjoina("lowerdir=", lower
, ",upperdir=", m
->source
, ",workdir=", m
->work_dir
);
1263 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
1264 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
1269 static int mount_custom(const char *dest
) {
1275 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
1276 CustomMount
*m
= &arg_custom_mounts
[i
];
1280 case CUSTOM_MOUNT_BIND
:
1281 r
= mount_bind(dest
, m
);
1284 case CUSTOM_MOUNT_TMPFS
:
1285 r
= mount_tmpfs(dest
, m
);
1288 case CUSTOM_MOUNT_OVERLAY
:
1289 r
= mount_overlay(dest
, m
);
1293 assert_not_reached("Unknown custom mount type");
1303 static int mount_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
1307 to
= strjoina(dest
, "/sys/fs/cgroup/", hierarchy
);
1309 r
= path_is_mount_point(to
, 0);
1310 if (r
< 0 && r
!= -ENOENT
)
1311 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
1317 /* The superblock mount options of the mount point need to be
1318 * identical to the hosts', and hence writable... */
1319 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
1320 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
1322 /* ... hence let's only make the bind mount read-only, not the
1325 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1326 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
1331 static int mount_cgroup(const char *dest
) {
1332 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1333 const char *cgroup_root
;
1336 controllers
= set_new(&string_hash_ops
);
1340 r
= cg_kernel_controllers(controllers
);
1342 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1345 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1347 controller
= set_steal_first(controllers
);
1351 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1355 r
= readlink_malloc(origin
, &combined
);
1357 /* Not a symbolic link, but directly a single cgroup hierarchy */
1359 r
= mount_cgroup_hierarchy(dest
, controller
, controller
, true);
1364 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1366 _cleanup_free_
char *target
= NULL
;
1368 target
= prefix_root(dest
, origin
);
1372 /* A symbolic link, a combination of controllers in one hierarchy */
1374 if (!filename_is_valid(combined
)) {
1375 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1379 r
= mount_cgroup_hierarchy(dest
, combined
, combined
, true);
1383 r
= symlink_idempotent(combined
, target
);
1385 log_error("Invalid existing symlink for combined hierarchy");
1389 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1393 r
= mount_cgroup_hierarchy(dest
, "name=systemd,xattr", "systemd", false);
1397 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1398 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
1399 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
1404 static int mount_systemd_cgroup_writable(const char *dest
) {
1405 _cleanup_free_
char *own_cgroup_path
= NULL
;
1406 const char *systemd_root
, *systemd_own
;
1411 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1413 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1415 /* Make our own cgroup a (writable) bind mount */
1416 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
1417 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
1418 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
1420 /* And then remount the systemd cgroup root read-only */
1421 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1422 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1423 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
1428 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1434 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1437 if (uid
!= UID_INVALID
) {
1438 uid
+= arg_uid_shift
;
1440 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1444 if (gid
!= GID_INVALID
) {
1445 gid
+= (gid_t
) arg_uid_shift
;
1447 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1451 if (lchown(p
, uid
, gid
) < 0)
1457 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1460 q
= prefix_roota(root
, path
);
1461 if (mkdir(q
, mode
) < 0) {
1462 if (errno
== EEXIST
)
1467 return userns_lchown(q
, uid
, gid
);
1470 static int setup_timezone(const char *dest
) {
1471 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1472 const char *where
, *check
, *what
;
1478 /* Fix the timezone, if possible */
1479 r
= readlink_malloc("/etc/localtime", &p
);
1481 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1485 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1487 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1489 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1493 where
= prefix_roota(dest
, "/etc/localtime");
1494 r
= readlink_malloc(where
, &q
);
1496 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1498 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1500 /* Already pointing to the right place? Then do nothing .. */
1501 if (y
&& streq(y
, z
))
1505 check
= strjoina("/usr/share/zoneinfo/", z
);
1506 check
= prefix_root(dest
, check
);
1507 if (laccess(check
, F_OK
) < 0) {
1508 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1513 if (r
< 0 && errno
!= ENOENT
) {
1514 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1518 what
= strjoina("../usr/share/zoneinfo/", z
);
1519 if (symlink(what
, where
) < 0) {
1520 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1524 r
= userns_lchown(where
, 0, 0);
1526 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
1531 static int setup_resolv_conf(const char *dest
) {
1532 const char *where
= NULL
;
1537 if (arg_private_network
)
1540 /* Fix resolv.conf, if possible */
1541 where
= prefix_roota(dest
, "/etc/resolv.conf");
1543 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1545 /* If the file already exists as symlink, let's
1546 * suppress the warning, under the assumption that
1547 * resolved or something similar runs inside and the
1548 * symlink points there.
1550 * If the disk image is read-only, there's also no
1551 * point in complaining.
1553 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1554 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1558 r
= userns_lchown(where
, 0, 0);
1560 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
1565 static int setup_volatile_state(const char *directory
) {
1566 _cleanup_free_
char *buf
= NULL
;
1567 const char *p
, *options
;
1572 if (arg_volatile
!= VOLATILE_STATE
)
1575 /* --volatile=state means we simply overmount /var
1576 with a tmpfs, and the rest read-only. */
1578 r
= bind_remount_recursive(directory
, true);
1580 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1582 p
= prefix_roota(directory
, "/var");
1584 if (r
< 0 && errno
!= EEXIST
)
1585 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1587 options
= "mode=755";
1588 r
= tmpfs_patch_options(options
, &buf
);
1594 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
1595 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
1600 static int setup_volatile(const char *directory
) {
1601 bool tmpfs_mounted
= false, bind_mounted
= false;
1602 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1603 _cleanup_free_
char *buf
= NULL
;
1604 const char *f
, *t
, *options
;
1609 if (arg_volatile
!= VOLATILE_YES
)
1612 /* --volatile=yes means we mount a tmpfs to the root dir, and
1613 the original /usr to use inside it, and that read-only. */
1615 if (!mkdtemp(template))
1616 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1618 options
= "mode=755";
1619 r
= tmpfs_patch_options(options
, &buf
);
1625 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
1626 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
1630 tmpfs_mounted
= true;
1632 f
= prefix_roota(directory
, "/usr");
1633 t
= prefix_roota(template, "/usr");
1636 if (r
< 0 && errno
!= EEXIST
) {
1637 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1641 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1642 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
1646 bind_mounted
= true;
1648 r
= bind_remount_recursive(t
, true);
1650 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1654 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
1655 r
= log_error_errno(errno
, "Failed to move root mount: %m");
1659 (void) rmdir(template);
1668 (void) umount(template);
1669 (void) rmdir(template);
1673 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1677 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1678 SD_ID128_FORMAT_VAL(id
));
1683 static int setup_boot_id(const char *dest
) {
1684 const char *from
, *to
;
1685 sd_id128_t rnd
= {};
1689 if (arg_share_system
)
1692 /* Generate a new randomized boot ID, so that each boot-up of
1693 * the container gets a new one */
1695 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1696 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1698 r
= sd_id128_randomize(&rnd
);
1700 return log_error_errno(r
, "Failed to generate random boot id: %m");
1702 id128_format_as_uuid(rnd
, as_uuid
);
1704 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1706 return log_error_errno(r
, "Failed to write boot id: %m");
1708 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1709 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1710 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1711 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
1717 static int copy_devnodes(const char *dest
) {
1719 static const char devnodes
[] =
1730 _cleanup_umask_ mode_t u
;
1736 /* Create /dev/net, so that we can create /dev/net/tun in it */
1737 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1738 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1740 NULSTR_FOREACH(d
, devnodes
) {
1741 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1744 from
= strappend("/dev/", d
);
1745 to
= prefix_root(dest
, from
);
1747 if (stat(from
, &st
) < 0) {
1749 if (errno
!= ENOENT
)
1750 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1752 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1754 log_error("%s is not a char or block device, cannot copy.", from
);
1758 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1760 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1762 /* Some systems abusively restrict mknod but
1763 * allow bind mounts. */
1766 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1767 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1768 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1771 r
= userns_lchown(to
, 0, 0);
1773 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
1780 static int setup_pts(const char *dest
) {
1781 _cleanup_free_
char *options
= NULL
;
1785 if (arg_selinux_apifs_context
)
1786 (void) asprintf(&options
,
1787 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1788 arg_uid_shift
+ TTY_GID
,
1789 arg_selinux_apifs_context
);
1792 (void) asprintf(&options
,
1793 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1794 arg_uid_shift
+ TTY_GID
);
1799 /* Mount /dev/pts itself */
1800 p
= prefix_roota(dest
, "/dev/pts");
1801 if (mkdir(p
, 0755) < 0)
1802 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1803 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1804 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1805 if (userns_lchown(p
, 0, 0) < 0)
1806 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1808 /* Create /dev/ptmx symlink */
1809 p
= prefix_roota(dest
, "/dev/ptmx");
1810 if (symlink("pts/ptmx", p
) < 0)
1811 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1812 if (userns_lchown(p
, 0, 0) < 0)
1813 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1815 /* And fix /dev/pts/ptmx ownership */
1816 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1817 if (userns_lchown(p
, 0, 0) < 0)
1818 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
1823 static int setup_dev_console(const char *dest
, const char *console
) {
1824 _cleanup_umask_ mode_t u
;
1833 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1835 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1837 /* We need to bind mount the right tty to /dev/console since
1838 * ptys can only exist on pts file systems. To have something
1839 * to bind mount things on we create a empty regular file. */
1841 to
= prefix_roota(dest
, "/dev/console");
1844 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1846 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1847 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
1852 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1853 const char *from
, *to
;
1854 _cleanup_umask_ mode_t u
;
1857 struct cmsghdr cmsghdr
;
1858 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1860 struct msghdr mh
= {
1861 .msg_control
= &control
,
1862 .msg_controllen
= sizeof(control
),
1864 struct cmsghdr
*cmsg
;
1866 assert(kmsg_socket
>= 0);
1870 /* We create the kmsg FIFO as /run/kmsg, but immediately
1871 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1872 * on the reading side behave very similar to /proc/kmsg,
1873 * their writing side behaves differently from /dev/kmsg in
1874 * that writing blocks when nothing is reading. In order to
1875 * avoid any problems with containers deadlocking due to this
1876 * we simply make /dev/kmsg unavailable to the container. */
1877 from
= prefix_roota(dest
, "/run/kmsg");
1878 to
= prefix_roota(dest
, "/proc/kmsg");
1880 if (mkfifo(from
, 0600) < 0)
1881 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1882 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1883 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1885 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1887 return log_error_errno(errno
, "Failed to open fifo: %m");
1889 cmsg
= CMSG_FIRSTHDR(&mh
);
1890 cmsg
->cmsg_level
= SOL_SOCKET
;
1891 cmsg
->cmsg_type
= SCM_RIGHTS
;
1892 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1893 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1895 mh
.msg_controllen
= cmsg
->cmsg_len
;
1897 /* Store away the fd in the socket, so that it stays open as
1898 * long as we run the child */
1899 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1903 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1905 /* And now make the FIFO unavailable as /run/kmsg... */
1906 (void) unlink(from
);
1911 static int send_rtnl(int send_fd
) {
1913 struct cmsghdr cmsghdr
;
1914 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1916 struct msghdr mh
= {
1917 .msg_control
= &control
,
1918 .msg_controllen
= sizeof(control
),
1920 struct cmsghdr
*cmsg
;
1921 _cleanup_close_
int fd
= -1;
1924 assert(send_fd
>= 0);
1926 if (!arg_expose_ports
)
1929 fd
= socket(PF_NETLINK
, SOCK_RAW
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, NETLINK_ROUTE
);
1931 return log_error_errno(errno
, "Failed to allocate container netlink: %m");
1933 cmsg
= CMSG_FIRSTHDR(&mh
);
1934 cmsg
->cmsg_level
= SOL_SOCKET
;
1935 cmsg
->cmsg_type
= SCM_RIGHTS
;
1936 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1937 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1939 mh
.msg_controllen
= cmsg
->cmsg_len
;
1941 /* Store away the fd in the socket, so that it stays open as
1942 * long as we run the child */
1943 k
= sendmsg(send_fd
, &mh
, MSG_NOSIGNAL
);
1945 return log_error_errno(errno
, "Failed to send netlink fd: %m");
1950 static int flush_ports(union in_addr_union
*exposed
) {
1952 int r
, af
= AF_INET
;
1956 if (!arg_expose_ports
)
1959 if (in_addr_is_null(af
, exposed
))
1962 log_debug("Lost IP address.");
1964 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
1965 r
= fw_add_local_dnat(false,
1976 log_warning_errno(r
, "Failed to modify firewall: %m");
1979 *exposed
= IN_ADDR_NULL
;
1983 static int expose_ports(sd_netlink
*rtnl
, union in_addr_union
*exposed
) {
1984 _cleanup_free_
struct local_address
*addresses
= NULL
;
1985 _cleanup_free_
char *pretty
= NULL
;
1986 union in_addr_union new_exposed
;
1989 int af
= AF_INET
, r
;
1993 /* Invoked each time an address is added or removed inside the
1996 if (!arg_expose_ports
)
1999 r
= local_addresses(rtnl
, 0, af
, &addresses
);
2001 return log_error_errno(r
, "Failed to enumerate local addresses: %m");
2004 addresses
[0].family
== af
&&
2005 addresses
[0].scope
< RT_SCOPE_LINK
;
2008 return flush_ports(exposed
);
2010 new_exposed
= addresses
[0].address
;
2011 if (in_addr_equal(af
, exposed
, &new_exposed
))
2014 in_addr_to_string(af
, &new_exposed
, &pretty
);
2015 log_debug("New container IP is %s.", strna(pretty
));
2017 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
2019 r
= fw_add_local_dnat(true,
2028 in_addr_is_null(af
, exposed
) ? NULL
: exposed
);
2030 log_warning_errno(r
, "Failed to modify firewall: %m");
2033 *exposed
= new_exposed
;
2037 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2038 union in_addr_union
*exposed
= userdata
;
2044 expose_ports(rtnl
, exposed
);
2048 static int watch_rtnl(sd_event
*event
, int recv_fd
, union in_addr_union
*exposed
, sd_netlink
**ret
) {
2050 struct cmsghdr cmsghdr
;
2051 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2053 struct msghdr mh
= {
2054 .msg_control
= &control
,
2055 .msg_controllen
= sizeof(control
),
2057 struct cmsghdr
*cmsg
;
2058 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2063 assert(recv_fd
>= 0);
2066 if (!arg_expose_ports
)
2069 k
= recvmsg(recv_fd
, &mh
, MSG_NOSIGNAL
);
2071 return log_error_errno(errno
, "Failed to recv netlink fd: %m");
2073 cmsg
= CMSG_FIRSTHDR(&mh
);
2074 assert(cmsg
->cmsg_level
== SOL_SOCKET
);
2075 assert(cmsg
->cmsg_type
== SCM_RIGHTS
);
2076 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
2077 memcpy(&fd
, CMSG_DATA(cmsg
), sizeof(int));
2079 r
= sd_netlink_open_fd(&rtnl
, fd
);
2082 return log_error_errno(r
, "Failed to create rtnl object: %m");
2085 r
= sd_netlink_add_match(rtnl
, RTM_NEWADDR
, on_address_change
, exposed
);
2087 return log_error_errno(r
, "Failed to subscribe to RTM_NEWADDR messages: %m");
2089 r
= sd_netlink_add_match(rtnl
, RTM_DELADDR
, on_address_change
, exposed
);
2091 return log_error_errno(r
, "Failed to subscribe to RTM_DELADDR messages: %m");
2093 r
= sd_netlink_attach_event(rtnl
, event
, 0);
2095 return log_error_errno(r
, "Failed to add to even loop: %m");
2103 static int setup_hostname(void) {
2105 if (arg_share_system
)
2108 if (sethostname_idempotent(arg_machine
) < 0)
2114 static int setup_journal(const char *directory
) {
2115 sd_id128_t machine_id
, this_id
;
2116 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
2117 const char *etc_machine_id
, *p
, *q
;
2121 /* Don't link journals in ephemeral mode */
2125 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2127 r
= read_one_line_file(etc_machine_id
, &b
);
2128 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
2131 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2134 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
2137 /* Verify validity */
2138 r
= sd_id128_from_string(id
, &machine_id
);
2140 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2142 r
= sd_id128_get_machine(&this_id
);
2144 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2146 if (sd_id128_equal(machine_id
, this_id
)) {
2147 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
2148 "Host and machine ids are equal (%s): refusing to link journals", id
);
2149 if (arg_link_journal
== LINK_AUTO
)
2154 if (arg_link_journal
== LINK_NO
)
2157 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
2159 return log_error_errno(r
, "Failed to create /var: %m");
2161 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
2163 return log_error_errno(r
, "Failed to create /var/log: %m");
2165 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
2167 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
2169 p
= strjoina("/var/log/journal/", id
);
2170 q
= prefix_roota(directory
, p
);
2172 if (path_is_mount_point(p
, 0) > 0) {
2173 if (arg_link_journal
!= LINK_AUTO
) {
2174 log_error("%s: already a mount point, refusing to use for journal", p
);
2181 if (path_is_mount_point(q
, 0) > 0) {
2182 if (arg_link_journal
!= LINK_AUTO
) {
2183 log_error("%s: already a mount point, refusing to use for journal", q
);
2190 r
= readlink_and_make_absolute(p
, &d
);
2192 if ((arg_link_journal
== LINK_GUEST
||
2193 arg_link_journal
== LINK_AUTO
) &&
2196 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2198 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2203 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2204 } else if (r
== -EINVAL
) {
2206 if (arg_link_journal
== LINK_GUEST
&&
2209 if (errno
== ENOTDIR
) {
2210 log_error("%s already exists and is neither a symlink nor a directory", p
);
2213 log_error_errno(errno
, "Failed to remove %s: %m", p
);
2217 } else if (r
!= -ENOENT
) {
2218 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
2222 if (arg_link_journal
== LINK_GUEST
) {
2224 if (symlink(q
, p
) < 0) {
2225 if (arg_link_journal_try
) {
2226 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2229 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2234 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2236 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2240 if (arg_link_journal
== LINK_HOST
) {
2241 /* don't create parents here -- if the host doesn't have
2242 * permanent journal set up, don't force it here */
2245 if (arg_link_journal_try
) {
2246 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
2249 log_error_errno(errno
, "Failed to create %s: %m", p
);
2254 } else if (access(p
, F_OK
) < 0)
2257 if (dir_is_empty(q
) == 0)
2258 log_warning("%s is not empty, proceeding anyway.", q
);
2260 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2262 log_error_errno(errno
, "Failed to create %s: %m", q
);
2266 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
2267 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
2272 static int drop_capabilities(void) {
2273 return capability_bounding_set_drop(~arg_retain
, false);
2276 static int register_machine(pid_t pid
, int local_ifindex
) {
2277 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2278 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2284 r
= sd_bus_default_system(&bus
);
2286 return log_error_errno(r
, "Failed to open system bus: %m");
2288 if (arg_keep_unit
) {
2289 r
= sd_bus_call_method(
2291 "org.freedesktop.machine1",
2292 "/org/freedesktop/machine1",
2293 "org.freedesktop.machine1.Manager",
2294 "RegisterMachineWithNetwork",
2299 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2303 strempty(arg_directory
),
2304 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2306 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
2310 r
= sd_bus_message_new_method_call(
2313 "org.freedesktop.machine1",
2314 "/org/freedesktop/machine1",
2315 "org.freedesktop.machine1.Manager",
2316 "CreateMachineWithNetwork");
2318 return bus_log_create_error(r
);
2320 r
= sd_bus_message_append(
2324 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2328 strempty(arg_directory
),
2329 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2331 return bus_log_create_error(r
);
2333 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
2335 return bus_log_create_error(r
);
2337 if (!isempty(arg_slice
)) {
2338 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
2340 return bus_log_create_error(r
);
2343 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
2345 return bus_log_create_error(r
);
2347 /* If you make changes here, also make sure to update
2348 * systemd-nspawn@.service, to keep the device
2349 * policies in sync regardless if we are run with or
2350 * without the --keep-unit switch. */
2351 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
2352 /* Allow the container to
2353 * access and create the API
2354 * device nodes, so that
2355 * PrivateDevices= in the
2356 * container can work
2361 "/dev/random", "rwm",
2362 "/dev/urandom", "rwm",
2364 "/dev/net/tun", "rwm",
2365 /* Allow the container
2366 * access to ptys. However,
2368 * container to ever create
2369 * these device nodes. */
2370 "/dev/pts/ptmx", "rw",
2373 return bus_log_create_error(r
);
2375 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
2376 CustomMount
*cm
= &arg_custom_mounts
[j
];
2378 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
2381 r
= is_device_node(cm
->source
);
2383 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
2386 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
2387 cm
->source
, cm
->read_only
? "r" : "rw");
2389 return log_error_errno(r
, "Failed to append message arguments: %m");
2393 if (arg_kill_signal
!= 0) {
2394 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
2396 return bus_log_create_error(r
);
2398 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
2400 return bus_log_create_error(r
);
2403 STRV_FOREACH(i
, arg_property
) {
2404 r
= sd_bus_message_open_container(m
, 'r', "sv");
2406 return bus_log_create_error(r
);
2408 r
= bus_append_unit_property_assignment(m
, *i
);
2412 r
= sd_bus_message_close_container(m
);
2414 return bus_log_create_error(r
);
2417 r
= sd_bus_message_close_container(m
);
2419 return bus_log_create_error(r
);
2421 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
2425 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
2432 static int terminate_machine(pid_t pid
) {
2433 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2434 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
2435 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2442 /* If we are reusing the unit, then just exit, systemd will do
2443 * the right thing when we exit. */
2447 r
= sd_bus_default_system(&bus
);
2449 return log_error_errno(r
, "Failed to open system bus: %m");
2451 r
= sd_bus_call_method(
2453 "org.freedesktop.machine1",
2454 "/org/freedesktop/machine1",
2455 "org.freedesktop.machine1.Manager",
2462 /* Note that the machine might already have been
2463 * cleaned up automatically, hence don't consider it a
2464 * failure if we cannot get the machine object. */
2465 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
2469 r
= sd_bus_message_read(reply
, "o", &path
);
2471 return bus_log_parse_error(r
);
2473 r
= sd_bus_call_method(
2475 "org.freedesktop.machine1",
2477 "org.freedesktop.machine1.Machine",
2483 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
2490 static int reset_audit_loginuid(void) {
2491 _cleanup_free_
char *p
= NULL
;
2494 if (arg_share_system
)
2497 r
= read_one_line_file("/proc/self/loginuid", &p
);
2501 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2503 /* Already reset? */
2504 if (streq(p
, "4294967295"))
2507 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
2510 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2511 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2512 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2513 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2514 "using systemd-nspawn. Sleeping for 5s... (%m)");
2522 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2523 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2524 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2526 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
, uint64_t idx
) {
2532 l
= strlen(arg_machine
);
2533 sz
= sizeof(sd_id128_t
) + l
;
2539 /* fetch some persistent data unique to the host */
2540 r
= sd_id128_get_machine((sd_id128_t
*) v
);
2544 /* combine with some data unique (on this host) to this
2545 * container instance */
2546 i
= mempcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
2549 memcpy(i
, &idx
, sizeof(idx
));
2552 /* Let's hash the host machine ID plus the container name. We
2553 * use a fixed, but originally randomly created hash key here. */
2554 siphash24(result
, v
, sz
, hash_key
.bytes
);
2556 assert_cc(ETH_ALEN
<= sizeof(result
));
2557 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
2559 /* see eth_random_addr in the kernel */
2560 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
2561 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
2566 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
2567 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2568 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2569 struct ether_addr mac_host
, mac_container
;
2572 if (!arg_private_network
)
2575 if (!arg_network_veth
)
2578 /* Use two different interface name prefixes depending whether
2579 * we are in bridge mode or not. */
2580 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
2581 arg_network_bridge
? "vb" : "ve", arg_machine
);
2583 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
, 0);
2585 return log_error_errno(r
, "Failed to generate predictable MAC address for container side: %m");
2587 r
= generate_mac(&mac_host
, HOST_HASH_KEY
, 0);
2589 return log_error_errno(r
, "Failed to generate predictable MAC address for host side: %m");
2591 r
= sd_netlink_open(&rtnl
);
2593 return log_error_errno(r
, "Failed to connect to netlink: %m");
2595 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2597 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2599 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, iface_name
);
2601 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2603 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
2605 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2607 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2609 return log_error_errno(r
, "Failed to open netlink container: %m");
2611 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
2613 return log_error_errno(r
, "Failed to open netlink container: %m");
2615 r
= sd_netlink_message_open_container(m
, VETH_INFO_PEER
);
2617 return log_error_errno(r
, "Failed to open netlink container: %m");
2619 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, "host0");
2621 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2623 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
2625 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2627 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2629 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2631 r
= sd_netlink_message_close_container(m
);
2633 return log_error_errno(r
, "Failed to close netlink container: %m");
2635 r
= sd_netlink_message_close_container(m
);
2637 return log_error_errno(r
, "Failed to close netlink container: %m");
2639 r
= sd_netlink_message_close_container(m
);
2641 return log_error_errno(r
, "Failed to close netlink container: %m");
2643 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2645 return log_error_errno(r
, "Failed to add new veth interfaces (host0, %s): %m", iface_name
);
2647 i
= (int) if_nametoindex(iface_name
);
2649 return log_error_errno(errno
, "Failed to resolve interface %s: %m", iface_name
);
2656 static int setup_bridge(const char veth_name
[], int *ifi
) {
2657 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2658 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2661 if (!arg_private_network
)
2664 if (!arg_network_veth
)
2667 if (!arg_network_bridge
)
2670 bridge
= (int) if_nametoindex(arg_network_bridge
);
2672 return log_error_errno(errno
, "Failed to resolve interface %s: %m", arg_network_bridge
);
2676 r
= sd_netlink_open(&rtnl
);
2678 return log_error_errno(r
, "Failed to connect to netlink: %m");
2680 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
2682 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2684 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
2686 return log_error_errno(r
, "Failed to set IFF_UP flag: %m");
2688 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, veth_name
);
2690 return log_error_errno(r
, "Failed to add netlink interface name field: %m");
2692 r
= sd_netlink_message_append_u32(m
, IFLA_MASTER
, bridge
);
2694 return log_error_errno(r
, "Failed to add netlink master field: %m");
2696 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2698 return log_error_errno(r
, "Failed to add veth interface to bridge: %m");
2703 static int parse_interface(struct udev
*udev
, const char *name
) {
2704 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2705 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
2708 ifi
= (int) if_nametoindex(name
);
2710 return log_error_errno(errno
, "Failed to resolve interface %s: %m", name
);
2712 sprintf(ifi_str
, "n%i", ifi
);
2713 d
= udev_device_new_from_device_id(udev
, ifi_str
);
2715 return log_error_errno(errno
, "Failed to get udev device for interface %s: %m", name
);
2717 if (udev_device_get_is_initialized(d
) <= 0) {
2718 log_error("Network interface %s is not initialized yet.", name
);
2725 static int move_network_interfaces(pid_t pid
) {
2726 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2727 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2731 if (!arg_private_network
)
2734 if (strv_isempty(arg_network_interfaces
))
2737 r
= sd_netlink_open(&rtnl
);
2739 return log_error_errno(r
, "Failed to connect to netlink: %m");
2743 log_error("Failed to connect to udev.");
2747 STRV_FOREACH(i
, arg_network_interfaces
) {
2748 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2751 ifi
= parse_interface(udev
, *i
);
2755 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
2757 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2759 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2761 return log_error_errno(r
, "Failed to append namespace PID to netlink message: %m");
2763 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2765 return log_error_errno(r
, "Failed to move interface %s to namespace: %m", *i
);
2771 static int setup_macvlan(pid_t pid
) {
2772 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2773 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2778 if (!arg_private_network
)
2781 if (strv_isempty(arg_network_macvlan
))
2784 r
= sd_netlink_open(&rtnl
);
2786 return log_error_errno(r
, "Failed to connect to netlink: %m");
2790 log_error("Failed to connect to udev.");
2794 STRV_FOREACH(i
, arg_network_macvlan
) {
2795 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2796 _cleanup_free_
char *n
= NULL
;
2797 struct ether_addr mac
;
2800 ifi
= parse_interface(udev
, *i
);
2804 r
= generate_mac(&mac
, MACVLAN_HASH_KEY
, idx
++);
2806 return log_error_errno(r
, "Failed to create MACVLAN MAC address: %m");
2808 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2810 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2812 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2814 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2816 n
= strappend("mv-", *i
);
2820 strshorten(n
, IFNAMSIZ
-1);
2822 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2824 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2826 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac
);
2828 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2830 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2832 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2834 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2836 return log_error_errno(r
, "Failed to open netlink container: %m");
2838 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
2840 return log_error_errno(r
, "Failed to open netlink container: %m");
2842 r
= sd_netlink_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
2844 return log_error_errno(r
, "Failed to append macvlan mode: %m");
2846 r
= sd_netlink_message_close_container(m
);
2848 return log_error_errno(r
, "Failed to close netlink container: %m");
2850 r
= sd_netlink_message_close_container(m
);
2852 return log_error_errno(r
, "Failed to close netlink container: %m");
2854 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2856 return log_error_errno(r
, "Failed to add new macvlan interfaces: %m");
2862 static int setup_ipvlan(pid_t pid
) {
2863 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2864 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2868 if (!arg_private_network
)
2871 if (strv_isempty(arg_network_ipvlan
))
2874 r
= sd_netlink_open(&rtnl
);
2876 return log_error_errno(r
, "Failed to connect to netlink: %m");
2880 log_error("Failed to connect to udev.");
2884 STRV_FOREACH(i
, arg_network_ipvlan
) {
2885 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2886 _cleanup_free_
char *n
= NULL
;
2889 ifi
= parse_interface(udev
, *i
);
2893 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2895 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2897 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2899 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2901 n
= strappend("iv-", *i
);
2905 strshorten(n
, IFNAMSIZ
-1);
2907 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2909 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2911 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2913 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2915 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2917 return log_error_errno(r
, "Failed to open netlink container: %m");
2919 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "ipvlan");
2921 return log_error_errno(r
, "Failed to open netlink container: %m");
2923 r
= sd_netlink_message_append_u16(m
, IFLA_IPVLAN_MODE
, IPVLAN_MODE_L2
);
2925 return log_error_errno(r
, "Failed to add ipvlan mode: %m");
2927 r
= sd_netlink_message_close_container(m
);
2929 return log_error_errno(r
, "Failed to close netlink container: %m");
2931 r
= sd_netlink_message_close_container(m
);
2933 return log_error_errno(r
, "Failed to close netlink container: %m");
2935 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2937 return log_error_errno(r
, "Failed to add new ipvlan interfaces: %m");
/* Installs the container's seccomp filter (default action: allow).
 *
 * For every blacklist entry whose capability is not in arg_retain the
 * syscall is made to fail with EPERM, and creation of NETLINK_AUDIT
 * sockets is made to fail with EAFNOSUPPORT. NO_NEW_PRIVS is explicitly
 * left unset on the filter.
 *
 * Returns 0 on success (also when the kernel rejects the filter with
 * EINVAL, which is only logged at debug level), negative errno-style
 * value otherwise. Compiles to a no-op without HAVE_SECCOMP. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx seccomp;
        unsigned i;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (i = 0; i < ELEMENTSOF(blacklist); i++) {
                /* Capability retained: leave the syscall usable */
                if (arg_retain & (1ULL << blacklist[i].capability))
                        continue;

                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[i].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        seccomp,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r == -EINVAL) {
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
                goto finish;
        }
        if (r < 0) {
                log_error_errno(r, "Failed to install seccomp audit filter: %m");
                goto finish;
        }

finish:
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
3038 static int setup_propagate(const char *root
) {
3041 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3042 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3043 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3044 (void) mkdir_p(p
, 0600);
3046 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
3047 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
3049 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3050 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
3052 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3053 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
3055 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
3056 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
3057 return log_error_errno(errno
, "Failed to install propagation bind mount.");
3059 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
3060 return log_error_errno(errno
, "Failed to make propagation mount read-only");
3065 static int setup_image(char **device_path
, int *loop_nr
) {
3066 struct loop_info64 info
= {
3067 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
3069 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
3070 _cleanup_free_
char* loopdev
= NULL
;
3074 assert(device_path
);
3078 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3080 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
3082 if (fstat(fd
, &st
) < 0)
3083 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
3085 if (S_ISBLK(st
.st_mode
)) {
3088 p
= strdup(arg_image
);
3102 if (!S_ISREG(st
.st_mode
)) {
3103 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
3107 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3109 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
3111 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
3113 return log_error_errno(errno
, "Failed to allocate loop device: %m");
3115 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
3118 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3120 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
3122 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
3123 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
3126 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
3128 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
3129 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
3131 *device_path
= loopdev
;
/* Explanatory text appended to the error messages about unusable
 * partition tables in disk images. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
/* Inspects the partition table of the block device open on `fd` and
 * picks the partitions to mount as root, /home and /srv.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are skipped; for each of the
 * home/srv/native-root/secondary-root types the partition with the
 * lowest partition number wins; a single GPT_LINUX_GENERIC partition is
 * used as root when no typed root partition exists.
 * MBR: a single partition of type 0x83 with the bootable flag (0x80) is
 * used as root.
 *
 * Device node paths are returned in *root_device, *home_device and
 * *srv_device (ownership passes to the caller; home/srv are only written
 * when found), the matching *_rw flags tell whether the partition may be
 * mounted writable, and *secondary is set when the secondary-architecture
 * root was chosen.
 *
 * Returns 0 on success, negative errno-style value on failure. Without
 * HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not reliably set errno; treat "unset" as OOM */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partitions matches blkid's.
         * The enumeration includes the whole-disk device itself, hence
         * the "+ 1" in the comparisons below. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record as "generic", not "root": storing into
                                 * root left generic unset, so a second bootable
                                 * partition silently replaced the first instead
                                 * of triggering the ambiguity error below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
/* Mounts block device `what` at `where` + `directory` (or at `where`
 * itself when `directory` is NULL), using the file system type that
 * libblkid detects on the device.
 *
 * The mount is made MS_NODEV, and read-only when `rw` is false or
 * arg_read_only is set. LUKS containers are rejected.
 *
 * Returns 0 on success, negative errno-style value on failure. Without
 * HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* 1 = nothing detected, -2 = ambivalent result: in both cases the
         * type cannot be determined. -1 is a real probing error and is
         * reported through the branch below (same split as dissect_image()). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3586 static int mount_devices(
3588 const char *root_device
, bool root_device_rw
,
3589 const char *home_device
, bool home_device_rw
,
3590 const char *srv_device
, bool srv_device_rw
) {
3596 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
3598 return log_error_errno(r
, "Failed to mount root directory: %m");
3602 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
3604 return log_error_errno(r
, "Failed to mount home directory: %m");
3608 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
3610 return log_error_errno(r
, "Failed to mount server data directory: %m");
3616 static void loop_remove(int nr
, int *image_fd
) {
3617 _cleanup_close_
int control
= -1;
3623 if (image_fd
&& *image_fd
>= 0) {
3624 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
3626 log_debug_errno(errno
, "Failed to close loop image: %m");
3627 *image_fd
= safe_close(*image_fd
);
3630 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3632 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
3636 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
3638 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
3641 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
3649 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0)
3650 return log_error_errno(errno
, "Failed to allocate pipe: %m");
3654 return log_error_errno(errno
, "Failed to fork getent child: %m");
3655 else if (pid
== 0) {
3657 char *empty_env
= NULL
;
3659 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
3660 _exit(EXIT_FAILURE
);
3662 if (pipe_fds
[0] > 2)
3663 safe_close(pipe_fds
[0]);
3664 if (pipe_fds
[1] > 2)
3665 safe_close(pipe_fds
[1]);
3667 nullfd
= open("/dev/null", O_RDWR
);
3669 _exit(EXIT_FAILURE
);
3671 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
3672 _exit(EXIT_FAILURE
);
3674 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
3675 _exit(EXIT_FAILURE
);
3680 (void) reset_all_signal_handlers();
3681 (void) reset_signal_mask();
3682 close_all_fds(NULL
, 0);
3684 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3685 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3686 _exit(EXIT_FAILURE
);
3689 pipe_fds
[1] = safe_close(pipe_fds
[1]);
3696 static int change_uid_gid(char **_home
) {
3697 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
3698 const char *word
, *state
;
3699 _cleanup_free_ uid_t
*uids
= NULL
;
3700 _cleanup_free_
char *home
= NULL
;
3701 _cleanup_fclose_
FILE *f
= NULL
;
3702 _cleanup_close_
int fd
= -1;
3703 unsigned n_uids
= 0;
3712 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
3713 /* Reset everything fully to 0, just in case */
3715 r
= reset_uid_gid();
3717 return log_error_errno(r
, "Failed to become root: %m");
3723 /* First, get user credentials */
3724 fd
= spawn_getent("passwd", arg_user
, &pid
);
3728 f
= fdopen(fd
, "r");
3733 if (!fgets(line
, sizeof(line
), f
)) {
3736 log_error("Failed to resolve user %s.", arg_user
);
3740 log_error_errno(errno
, "Failed to read from getent: %m");
3746 wait_for_terminate_and_warn("getent passwd", pid
, true);
3748 x
= strchr(line
, ':');
3750 log_error("/etc/passwd entry has invalid user field.");
3754 u
= strchr(x
+1, ':');
3756 log_error("/etc/passwd entry has invalid password field.");
3763 log_error("/etc/passwd entry has invalid UID field.");
3771 log_error("/etc/passwd entry has invalid GID field.");
3776 h
= strchr(x
+1, ':');
3778 log_error("/etc/passwd entry has invalid GECOS field.");
3785 log_error("/etc/passwd entry has invalid home directory field.");
3791 r
= parse_uid(u
, &uid
);
3793 log_error("Failed to parse UID of user.");
3797 r
= parse_gid(g
, &gid
);
3799 log_error("Failed to parse GID of user.");
3807 /* Second, get group memberships */
3808 fd
= spawn_getent("initgroups", arg_user
, &pid
);
3813 f
= fdopen(fd
, "r");
3818 if (!fgets(line
, sizeof(line
), f
)) {
3820 log_error("Failed to resolve user %s.", arg_user
);
3824 log_error_errno(errno
, "Failed to read from getent: %m");
3830 wait_for_terminate_and_warn("getent initgroups", pid
, true);
3832 /* Skip over the username and subsequent separator whitespace */
3834 x
+= strcspn(x
, WHITESPACE
);
3835 x
+= strspn(x
, WHITESPACE
);
3837 FOREACH_WORD(word
, l
, x
, state
) {
3843 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
3846 r
= parse_uid(c
, &uids
[n_uids
++]);
3848 log_error("Failed to parse group data from getent.");
3853 r
= mkdir_parents(home
, 0775);
3855 return log_error_errno(r
, "Failed to make home root directory: %m");
3857 r
= mkdir_safe(home
, 0755, uid
, gid
);
3858 if (r
< 0 && r
!= -EEXIST
)
3859 return log_error_errno(r
, "Failed to make home directory: %m");
3861 (void) fchown(STDIN_FILENO
, uid
, gid
);
3862 (void) fchown(STDOUT_FILENO
, uid
, gid
);
3863 (void) fchown(STDERR_FILENO
, uid
, gid
);
3865 if (setgroups(n_uids
, uids
) < 0)
3866 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
3868 if (setresgid(gid
, gid
, gid
) < 0)
3869 return log_error_errno(errno
, "setregid() failed: %m");
3871 if (setresuid(uid
, uid
, uid
) < 0)
3872 return log_error_errno(errno
, "setreuid() failed: %m");
3884 * < 0 : wait_for_terminate() failed to get the state of the
3885 * container, the container was terminated by a signal, or
3886 * failed for an unknown reason. No change is made to the
3887 * container argument.
3888 * > 0 : The program executed in the container terminated with an
3889 * error. The exit code of the program executed in the
3890 * container is returned. The container argument has been set
3891 * to CONTAINER_TERMINATED.
3892 * 0 : The container is being rebooted, has been shut down or exited
3893 * successfully. The container argument has been set to either
3894 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3896 * That is, success is indicated by a return value of zero, and an
3897 * error is indicated by a non-zero value.
3899 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
3903 r
= wait_for_terminate(pid
, &status
);
3905 return log_warning_errno(r
, "Failed to wait for container: %m");
3907 switch (status
.si_code
) {
3910 if (status
.si_status
== 0) {
3911 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
3914 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
3916 *container
= CONTAINER_TERMINATED
;
3917 return status
.si_status
;
3920 if (status
.si_status
== SIGINT
) {
3922 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
3923 *container
= CONTAINER_TERMINATED
;
3926 } else if (status
.si_status
== SIGHUP
) {
3928 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
3929 *container
= CONTAINER_REBOOTED
;
3933 /* CLD_KILLED fallthrough */
3936 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
3940 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {}
3949 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
3952 pid
= PTR_TO_UINT32(userdata
);
3954 if (kill(pid
, arg_kill_signal
) >= 0) {
3955 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3956 sd_event_source_set_userdata(s
, NULL
);
3961 sd_event_exit(sd_event_source_get_event(s
), 0);
3965 static int determine_names(void) {
3968 if (!arg_image
&& !arg_directory
) {
3970 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3972 r
= image_find(arg_machine
, &i
);
3974 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3976 log_error("No image for machine '%s': %m", arg_machine
);
3980 if (i
->type
== IMAGE_RAW
)
3981 r
= set_sanitized_path(&arg_image
, i
->path
);
3983 r
= set_sanitized_path(&arg_directory
, i
->path
);
3985 return log_error_errno(r
, "Invalid image directory: %m");
3988 arg_read_only
= arg_read_only
|| i
->read_only
;
3990 arg_directory
= get_current_dir_name();
3992 if (!arg_directory
&& !arg_machine
) {
3993 log_error("Failed to determine path, please use -D or -i.");
3999 if (arg_directory
&& path_equal(arg_directory
, "/"))
4000 arg_machine
= gethostname_malloc();
4002 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
4007 hostname_cleanup(arg_machine
, false);
4008 if (!machine_name_is_valid(arg_machine
)) {
4009 log_error("Failed to determine machine name automatically, please use -M.");
4013 if (arg_ephemeral
) {
4016 /* Add a random suffix when this is an
4017 * ephemeral machine, so that we can run many
4018 * instances at once without manually having
4019 * to specify -M each time. */
4021 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
4032 static int determine_uid_shift(const char *directory
) {
4040 if (arg_uid_shift
== UID_INVALID
) {
4043 r
= stat(directory
, &st
);
4045 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
4047 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
4049 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
4050 log_error("UID and GID base of %s don't match.", directory
);
4054 arg_uid_range
= UINT32_C(0x10000);
4057 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
4058 log_error("UID base too high for UID range.");
4062 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
4066 static int inner_child(
4068 const char *directory
,
4076 _cleanup_free_
char *home
= NULL
;
4078 const char *envp
[] = {
4079 "PATH=" DEFAULT_PATH_SPLIT_USR
,
4080 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4085 NULL
, /* container_uuid */
4086 NULL
, /* LISTEN_FDS */
4087 NULL
, /* LISTEN_PID */
4091 _cleanup_strv_free_
char **env_use
= NULL
;
4096 assert(kmsg_socket
>= 0);
4099 /* Tell the parent, that it now can write the UID map. */
4100 (void) barrier_place(barrier
); /* #1 */
4102 /* Wait until the parent wrote the UID map */
4103 if (!barrier_place_and_sync(barrier
)) { /* #2 */
4104 log_error("Parent died too early");
4109 r
= mount_all(NULL
, true);
4113 /* Wait until we are cgroup-ified, so that we
4114 * can mount the right cgroup path writable */
4115 if (!barrier_place_and_sync(barrier
)) { /* #3 */
4116 log_error("Parent died too early");
4120 r
= mount_systemd_cgroup_writable("");
4124 r
= reset_uid_gid();
4126 return log_error_errno(r
, "Couldn't become new root: %m");
4128 r
= setup_boot_id(NULL
);
4132 r
= setup_kmsg(NULL
, kmsg_socket
);
4135 kmsg_socket
= safe_close(kmsg_socket
);
4140 return log_error_errno(errno
, "setsid() failed: %m");
4142 if (arg_private_network
)
4145 r
= send_rtnl(rtnl_socket
);
4148 rtnl_socket
= safe_close(rtnl_socket
);
4150 if (drop_capabilities() < 0)
4151 return log_error_errno(errno
, "drop_capabilities() failed: %m");
4155 if (arg_personality
!= PERSONALITY_INVALID
) {
4156 if (personality(arg_personality
) < 0)
4157 return log_error_errno(errno
, "personality() failed: %m");
4158 } else if (secondary
) {
4159 if (personality(PER_LINUX32
) < 0)
4160 return log_error_errno(errno
, "personality() failed: %m");
4164 if (arg_selinux_context
)
4165 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
4166 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
4169 r
= change_uid_gid(&home
);
4173 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
4177 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
4178 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
4179 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
4182 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
4185 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
4189 if (fdset_size(fds
) > 0) {
4190 r
= fdset_cloexec(fds
, false);
4192 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
4194 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
4195 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
4199 env_use
= strv_env_merge(2, envp
, arg_setenv
);
4203 /* Let the parent know that we are ready and
4204 * wait until the parent is ready with the
4206 if (!barrier_place_and_sync(barrier
)) { /* #4 */
4207 log_error("Parent died too early");
4211 /* Now, explicitly close the log, so that we
4212 * then can close all remaining fds. Closing
4213 * the log explicitly first has the benefit
4214 * that the logging subsystem knows about it,
4215 * and is thus ready to be reopened should we
4216 * need it again. Note that the other fds
4217 * closed here are at least the locking and
4220 (void) fdset_close_others(fds
);
4226 /* Automatically search for the init system */
4228 m
= 1 + argc
- optind
;
4229 a
= newa(char*, m
+ 1);
4230 memcpy(a
+ 1, argv
+ optind
, m
* sizeof(char*));
4232 a
[0] = (char*) "/usr/lib/systemd/systemd";
4233 execve(a
[0], a
, env_use
);
4235 a
[0] = (char*) "/lib/systemd/systemd";
4236 execve(a
[0], a
, env_use
);
4238 a
[0] = (char*) "/sbin/init";
4239 execve(a
[0], a
, env_use
);
4240 } else if (argc
> optind
)
4241 execvpe(argv
[optind
], argv
+ optind
, env_use
);
4243 chdir(home
? home
: "/root");
4244 execle("/bin/bash", "-bash", NULL
, env_use
);
4245 execle("/bin/sh", "-sh", NULL
, env_use
);
4249 return log_error_errno(errno
, "execv() failed: %m");
4252 static int outer_child(
4254 const char *directory
,
4255 const char *console
,
4256 const char *root_device
, bool root_device_rw
,
4257 const char *home_device
, bool home_device_rw
,
4258 const char *srv_device
, bool srv_device_rw
,
4264 int uid_shift_socket
,
4276 assert(pid_socket
>= 0);
4277 assert(kmsg_socket
>= 0);
4279 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
4280 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
4283 close_nointr(STDIN_FILENO
);
4284 close_nointr(STDOUT_FILENO
);
4285 close_nointr(STDERR_FILENO
);
4287 r
= open_terminal(console
, O_RDWR
);
4288 if (r
!= STDIN_FILENO
) {
4294 return log_error_errno(r
, "Failed to open console: %m");
4297 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
4298 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
4299 return log_error_errno(errno
, "Failed to duplicate console: %m");
4302 r
= reset_audit_loginuid();
4306 /* Mark everything as slave, so that we still
4307 * receive mounts from the real root, but don't
4308 * propagate mounts to the real root. */
4309 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
4310 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
4312 r
= mount_devices(directory
,
4313 root_device
, root_device_rw
,
4314 home_device
, home_device_rw
,
4315 srv_device
, srv_device_rw
);
4319 r
= determine_uid_shift(directory
);
4324 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
4326 return log_error_errno(errno
, "Failed to send UID shift: %m");
4327 if (l
!= sizeof(arg_uid_shift
)) {
4328 log_error("Short write while sending UID shift.");
4333 /* Turn directory into bind mount */
4334 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
4335 return log_error_errno(errno
, "Failed to make bind mount: %m");
4337 r
= setup_volatile(directory
);
4341 r
= setup_volatile_state(directory
);
4345 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
4349 if (arg_read_only
) {
4350 r
= bind_remount_recursive(directory
, true);
4352 return log_error_errno(r
, "Failed to make tree read-only: %m");
4355 r
= mount_all(directory
, false);
4359 if (copy_devnodes(directory
) < 0)
4362 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
4364 if (setup_pts(directory
) < 0)
4367 r
= setup_propagate(directory
);
4371 r
= setup_dev_console(directory
, console
);
4375 r
= setup_seccomp();
4379 r
= setup_timezone(directory
);
4383 r
= setup_resolv_conf(directory
);
4387 r
= setup_journal(directory
);
4391 r
= mount_custom(directory
);
4395 r
= mount_cgroup(directory
);
4399 r
= mount_move_root(directory
);
4401 return log_error_errno(r
, "Failed to move root directory: %m");
4403 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4404 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
4405 (arg_private_network
? CLONE_NEWNET
: 0) |
4406 (arg_userns
? CLONE_NEWUSER
: 0),
4409 return log_error_errno(errno
, "Failed to fork inner child: %m");
4412 pid_socket
= safe_close(pid_socket
);
4413 uid_shift_socket
= safe_close(uid_shift_socket
);
4415 /* The inner child has all namespaces that are
4416 * requested, so that we all are owned by the user if
4417 * user namespaces are turned on. */
4419 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
, argc
, argv
);
4421 _exit(EXIT_FAILURE
);
4423 _exit(EXIT_SUCCESS
);
4426 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4428 return log_error_errno(errno
, "Failed to send PID: %m");
4429 if (l
!= sizeof(pid
)) {
4430 log_error("Short write while sending PID.");
4434 pid_socket
= safe_close(pid_socket
);
4439 static int setup_uid_map(pid_t pid
) {
4440 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
4445 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4446 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
4447 r
= write_string_file(uid_map
, line
, 0);
4449 return log_error_errno(r
, "Failed to write UID map: %m");
4451 /* We always assign the same UID and GID ranges */
4452 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4453 r
= write_string_file(uid_map
, line
, 0);
4455 return log_error_errno(r
, "Failed to write GID map: %m");
4460 static int chown_cgroup(pid_t pid
) {
4461 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
4462 _cleanup_close_
int fd
= -1;
4466 r
= cg_pid_get_path(NULL
, pid
, &path
);
4468 return log_error_errno(r
, "Failed to get container cgroup path: %m");
4470 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
4472 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
4474 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
4476 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
4478 FOREACH_STRING(fn
, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4479 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
4480 log_warning_errno(errno
, "Failed to chown() cgroup file %s, ignoring: %m", fn
);
/* Program entry point.
 *
 * Visible flow: set up logging and parse the command line, refuse to run
 * unless effective UID is 0, collect passed-in file descriptors, prepare the
 * root of the container (either arg_directory, optionally snapshotted, or
 * arg_image set up on a temporary directory), allocate a pseudo terminal,
 * clone an "outer child" in a new mount namespace, receive the PID of the
 * inner child and the UID shift over socket pairs, write the UID/GID maps,
 * configure networking, register the machine, run an sd_event loop with a PTY
 * forwarder until the container goes away, and finally release everything
 * that was allocated.
 *
 * Returns EXIT_FAILURE if r is negative at the end, otherwise ret.
 *
 * NOTE(review): several control-flow lines (error checks, jumps, closing
 * braces) are not visible in this copy of the function; the comments below
 * describe only the statements that are. */
4485 int main(int argc
, char *argv
[]) {
4487 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
4488 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
4489 _cleanup_close_
int master
= -1, image_fd
= -1;
4490 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
4491 int r
, n_fd_passed
, loop_nr
= -1;
4492 char veth_name
[IFNAMSIZ
];
4493 bool secondary
= false, remove_subvol
= false;
4496 int ret
= EXIT_SUCCESS
;
4497 union in_addr_union exposed
= {};
4498 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
/* Configure logging from the environment, then parse the command line. */
4501 log_parse_environment();
4504 r
= parse_argv(argc
, argv
);
4508 r
= determine_names();
4512 if (geteuid() != 0) {
4513 log_error("Need to be root.");
/* Collect file descriptors passed in via sd_listen_fds() into fds. */
4518 n_fd_passed
= sd_listen_fds(false);
4519 if (n_fd_passed
> 0) {
4520 r
= fdset_new_listen_fds(&fds
, false);
4522 log_error_errno(r
, "Failed to collect file descriptors: %m");
/* Directory mode: the container root is the tree named by arg_directory. */
4527 if (arg_directory
) {
4530 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
4531 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4536 if (arg_ephemeral
) {
4537 _cleanup_free_
char *np
= NULL
;
4539 /* If the specified path is a mount point we
4540 * generate the new snapshot immediately
4541 * inside it under a random name. However if
4542 * the specified path is not a mount point we
4543 * create the new snapshot in the parent
4544 * directory, just next to it. */
4545 r
= path_is_mount_point(arg_directory
, 0);
4547 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
4551 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
4553 r
= tempfn_random(arg_directory
, "machine.", &np
);
4555 log_error_errno(r
, "Failed to generate name for snapshot: %m");
4559 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4561 log_error_errno(r
, "Failed to lock %s: %m", np
);
4565 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4567 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
4571 free(arg_directory
);
/* Remember that the snapshot has to be removed again during cleanup. */
4575 remove_subvol
= true;
/* Lock the tree itself: shared if arg_read_only is set, exclusive otherwise,
 * and with LOCK_NB so that a busy tree is reported instead of waited for. */
4578 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4580 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
4584 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
/* Populate arg_directory from arg_template by snapshot (with copy fallback). */
4589 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4592 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
4594 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
4598 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
/* Sanity checks on the tree before anything is started in it. */
4604 if (path_is_os_tree(arg_directory
) <= 0) {
4605 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
4612 p
= strjoina(arg_directory
,
4613 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
4614 if (access(p
, F_OK
) < 0) {
4615 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
/* Image mode: arg_directory becomes a freshly created temporary directory
 * and the image named by arg_image is set up and dissected. */
4622 char template[] = "/tmp/nspawn-root-XXXXXX";
4625 assert(!arg_template
);
4627 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4629 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
4633 r
= log_error_errno(r
, "Failed to create image lock: %m");
4637 if (!mkdtemp(template)) {
4638 log_error_errno(errno
, "Failed to create temporary directory: %m");
4643 arg_directory
= strdup(template);
4644 if (!arg_directory
) {
4649 image_fd
= setup_image(&device_path
, &loop_nr
);
4655 r
= dissect_image(image_fd
,
4656 &root_device
, &root_device_rw
,
4657 &home_device
, &home_device_rw
,
4658 &srv_device
, &srv_device_rw
,
4664 r
= custom_mounts_prepare();
4669 isatty(STDIN_FILENO
) > 0 &&
4670 isatty(STDOUT_FILENO
) > 0;
/* Allocate a pseudo terminal; the name of its slave side is stored in console. */
4672 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
4674 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
4678 r
= ptsname_malloc(master
, &console
);
4680 r
= log_error_errno(r
, "Failed to determine tty name: %m");
4684 if (unlockpt(master
) < 0) {
4685 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
4690 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4691 arg_machine
, arg_image
?: arg_directory
);
/* Block SIGCHLD, SIGWINCH, SIGTERM and SIGINT; mask_chld holds SIGCHLD alone
 * so that it can be unblocked and re-blocked separately further down. */
4693 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
4695 assert_se(sigemptyset(&mask_chld
) == 0);
4696 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
/* NOTE(review): presumably needed so that the inner child, which is created
 * by the outer child, is reparented to this process once the outer child
 * exits - confirm. */
4698 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
4699 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
/* State of one container run. NOTE(review): the "loop again" remark near the
 * end suggests this part is repeated when the container reboots - confirm. */
4704 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
4705 uid_shift_socket_pair
[2] = { -1, -1 };
4706 ContainerStatus container_status
;
4707 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
4708 static const struct sigaction sa
= {
4709 .sa_handler
= nop_handler
,
4710 .sa_flags
= SA_NOCLDSTOP
,
4714 _cleanup_event_unref_ sd_event
*event
= NULL
;
4715 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4716 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
4719 r
= barrier_create(&barrier
);
4721 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* Datagram socket pairs for talking to the children. As the safe_close()
 * calls below show, the child keeps index 1 and closes index 0, while the
 * parent reads from index 0. */
4725 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
4726 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
4730 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
4731 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
4735 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
4736 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
4741 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
4742 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
4746 /* Child can be killed before execv(), so handle SIGCHLD
4747 * in order to interrupt parent's blocking calls and
4748 * give it a chance to call wait() and terminate. */
4749 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4751 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
4755 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4757 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
4761 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
4763 if (errno
== EINVAL
)
4764 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4766 r
= log_error_errno(errno
, "clone() failed: %m");
4772 /* The outer child only has a file system namespace. */
4773 barrier_set_role(&barrier
, BARRIER_CHILD
);
4775 master
= safe_close(master
);
4777 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
4778 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4779 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
4780 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
4782 (void) reset_all_signal_handlers();
4783 (void) reset_signal_mask();
4785 r
= outer_child(&barrier
,
4788 root_device
, root_device_rw
,
4789 home_device
, home_device_rw
,
4790 srv_device
, srv_device_rw
,
4794 kmsg_socket_pair
[1],
4795 rtnl_socket_pair
[1],
4796 uid_shift_socket_pair
[1],
4800 _exit(EXIT_FAILURE
);
4802 _exit(EXIT_SUCCESS
);
/* Parent side from here on. */
4805 barrier_set_role(&barrier
, BARRIER_PARENT
);
4810 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
4811 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
4812 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
4814 /* Wait for the outer child. */
4815 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
4824 /* And now retrieve the PID of the inner child. */
4825 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
4827 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
4830 if (l
!= sizeof(pid
)) {
4831 log_error("Short read while reading inner child PID: %m");
4836 log_debug("Init process invoked as PID " PID_FMT
, pid
);
4839 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
4840 log_error("Child died too early.");
/* The UID shift is received from the child and stored in arg_uid_shift,
 * then used to write the ID maps for the inner child. */
4845 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
4847 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
4850 if (l
!= sizeof(arg_uid_shift
)) {
4851 log_error("Short read while reading UID shift: %m");
4856 r
= setup_uid_map(pid
);
4860 (void) barrier_place(&barrier
); /* #2 */
/* Configure networking for the container, addressing it by the child's PID. */
4863 r
= move_network_interfaces(pid
);
4867 r
= setup_veth(pid
, veth_name
, &ifi
);
4871 r
= setup_bridge(veth_name
, &ifi
);
4875 r
= setup_macvlan(pid
);
4879 r
= setup_ipvlan(pid
);
4883 r
= register_machine(pid
, ifi
);
4887 r
= chown_cgroup(pid
);
4891 /* Notify the child that the parent is ready with all
4892 * its setup (including cgroup-ification), and that
4893 * the child can now hand over control to the code to
4894 * run inside the container. */
4895 (void) barrier_place(&barrier
); /* #3 */
4897 /* Block SIGCHLD here, before notifying child.
4898 * process_pty() will handle it with the other signals. */
4899 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4901 /* Reset signal to default */
4902 r
= default_signals(SIGCHLD
, -1);
4904 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4908 /* Let the child know that we are ready and wait until the child is completely ready now. */
4909 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
4910 log_error("Client died too early.");
4917 "STATUS=Container running.\n"
4918 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
/* Build the event loop that runs while the container is up. */
4920 r
= sd_event_new(&event
);
4922 log_error_errno(r
, "Failed to get default event source: %m");
4926 if (arg_kill_signal
> 0) {
4927 /* Try to kill the init system on SIGINT or SIGTERM */
4928 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4929 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4931 /* Immediately exit */
4932 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4933 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4936 /* simply exit on sigchld */
4937 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
4939 if (arg_expose_ports
) {
4940 r
= watch_rtnl(event
, rtnl_socket_pair
[0], &exposed
, &rtnl
);
4944 (void) expose_ports(rtnl
, &exposed
);
4947 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
/* Attach the PTY forwarder for master to the event loop and run the loop. */
4949 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
4951 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4955 r
= sd_event_loop(event
);
4957 log_error_errno(r
, "Failed to run event loop: %m");
4961 pty_forward_get_last_char(forward
, &last_char
);
4963 forward
= pty_forward_free(forward
);
4965 if (!arg_quiet
&& last_char
!= '\n')
4968 /* Kill if it is not dead yet anyway */
4969 terminate_machine(pid
);
4971 /* Normally redundant, but better safe than sorry */
4974 r
= wait_for_container(pid
, &container_status
);
4978 /* We failed to wait for the container, or the
4979 * container exited abnormally */
4981 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
4982 /* The container exited with a non-zero
4983 * status, or with zero status and no reboot
4989 /* CONTAINER_REBOOTED, loop again */
4991 if (arg_keep_unit
) {
4992 /* Special handling if we are running as a
4993 * service: instead of simply restarting the
4994 * machine we want to restart the entire
4995 * service, so let's inform systemd about this
4996 * with the special exit code 133. The service
4997 * file uses RestartForceExitStatus=133 so
4998 * that this results in a full nspawn
4999 * restart. This is necessary since we might
5000 * have cgroup parameters set we want to have
5007 flush_ports(&exposed
);
5013 "STATUS=Terminating...");
5018 /* Try to flush whatever is still queued in the pty */
5020 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
/* Final cleanup: release the loop device, remove the snapshot if one was
 * created, remove the per-machine propagate directory and free the storage
 * held by the parsed options. */
5022 loop_remove(loop_nr
, &image_fd
);
5024 if (remove_subvol
&& arg_directory
) {
5027 k
= btrfs_subvol_remove(arg_directory
, true);
5029 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
5035 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
5036 (void) rm_rf(p
, REMOVE_ROOT
);
5039 free(arg_directory
);
5044 strv_free(arg_setenv
);
5045 strv_free(arg_network_interfaces
);
5046 strv_free(arg_network_macvlan
);
5047 strv_free(arg_network_ipvlan
);
5048 custom_mount_free_all();
5050 flush_ports(&exposed
);
5052 while (arg_expose_ports
) {
5053 ExposePort
*p
= arg_expose_ports
;
5054 LIST_REMOVE(ports
, arg_expose_ports
, p
);
/* A negative r always yields EXIT_FAILURE; otherwise ret is the exit status. */
5058 return r
< 0 ? EXIT_FAILURE
: ret
;