1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
82 #include "siphash24.h"
84 #include "base-filesystem.h"
86 #include "event-util.h"
87 #include "capability.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
102 #include "seccomp-util.h"
105 typedef struct ExposePort
{
108 uint16_t container_port
;
109 LIST_FIELDS(struct ExposePort
, ports
);
112 typedef enum ContainerStatus
{
113 CONTAINER_TERMINATED
,
117 typedef enum LinkJournal
{
124 typedef enum Volatile
{
130 typedef enum CustomMountType
{
133 CUSTOM_MOUNT_OVERLAY
,
136 typedef struct CustomMount
{
137 CustomMountType type
;
139 char *source
; /* for overlayfs this is the upper directory */
146 static char *arg_directory
= NULL
;
147 static char *arg_template
= NULL
;
148 static char *arg_user
= NULL
;
149 static sd_id128_t arg_uuid
= {};
150 static char *arg_machine
= NULL
;
151 static const char *arg_selinux_context
= NULL
;
152 static const char *arg_selinux_apifs_context
= NULL
;
153 static const char *arg_slice
= NULL
;
154 static bool arg_private_network
= false;
155 static bool arg_read_only
= false;
156 static bool arg_boot
= false;
157 static bool arg_ephemeral
= false;
158 static LinkJournal arg_link_journal
= LINK_AUTO
;
159 static bool arg_link_journal_try
= false;
160 static uint64_t arg_retain
=
161 (1ULL << CAP_CHOWN
) |
162 (1ULL << CAP_DAC_OVERRIDE
) |
163 (1ULL << CAP_DAC_READ_SEARCH
) |
164 (1ULL << CAP_FOWNER
) |
165 (1ULL << CAP_FSETID
) |
166 (1ULL << CAP_IPC_OWNER
) |
168 (1ULL << CAP_LEASE
) |
169 (1ULL << CAP_LINUX_IMMUTABLE
) |
170 (1ULL << CAP_NET_BIND_SERVICE
) |
171 (1ULL << CAP_NET_BROADCAST
) |
172 (1ULL << CAP_NET_RAW
) |
173 (1ULL << CAP_SETGID
) |
174 (1ULL << CAP_SETFCAP
) |
175 (1ULL << CAP_SETPCAP
) |
176 (1ULL << CAP_SETUID
) |
177 (1ULL << CAP_SYS_ADMIN
) |
178 (1ULL << CAP_SYS_CHROOT
) |
179 (1ULL << CAP_SYS_NICE
) |
180 (1ULL << CAP_SYS_PTRACE
) |
181 (1ULL << CAP_SYS_TTY_CONFIG
) |
182 (1ULL << CAP_SYS_RESOURCE
) |
183 (1ULL << CAP_SYS_BOOT
) |
184 (1ULL << CAP_AUDIT_WRITE
) |
185 (1ULL << CAP_AUDIT_CONTROL
) |
187 static CustomMount
*arg_custom_mounts
= NULL
;
188 static unsigned arg_n_custom_mounts
= 0;
189 static char **arg_setenv
= NULL
;
190 static bool arg_quiet
= false;
191 static bool arg_share_system
= false;
192 static bool arg_register
= true;
193 static bool arg_keep_unit
= false;
194 static char **arg_network_interfaces
= NULL
;
195 static char **arg_network_macvlan
= NULL
;
196 static char **arg_network_ipvlan
= NULL
;
197 static bool arg_network_veth
= false;
198 static const char *arg_network_bridge
= NULL
;
199 static unsigned long arg_personality
= PERSONALITY_INVALID
;
200 static char *arg_image
= NULL
;
201 static Volatile arg_volatile
= VOLATILE_NO
;
202 static ExposePort
*arg_expose_ports
= NULL
;
203 static char **arg_property
= NULL
;
204 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
205 static bool arg_userns
= false;
206 static int arg_kill_signal
= 0;
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create a ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to \n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name
);
278 static CustomMount
* custom_mount_add(CustomMountType t
) {
279 CustomMount
*c
, *ret
;
281 c
= realloc(arg_custom_mounts
, (arg_n_custom_mounts
+ 1) * sizeof(CustomMount
));
285 arg_custom_mounts
= c
;
286 ret
= arg_custom_mounts
+ arg_n_custom_mounts
;
287 arg_n_custom_mounts
++;
289 *ret
= (CustomMount
) { .type
= t
};
294 static void custom_mount_free_all(void) {
297 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
298 CustomMount
*m
= &arg_custom_mounts
[i
];
301 free(m
->destination
);
305 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
312 arg_custom_mounts
= mfree(arg_custom_mounts
);
313 arg_n_custom_mounts
= 0;
316 static int custom_mount_compare(const void *a
, const void *b
) {
317 const CustomMount
*x
= a
, *y
= b
;
320 r
= path_compare(x
->destination
, y
->destination
);
324 if (x
->type
< y
->type
)
326 if (x
->type
> y
->type
)
332 static int custom_mounts_prepare(void) {
336 /* Ensure the mounts are applied prefix first. */
337 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
339 /* Allocate working directories for the overlay file systems that need it */
340 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
341 CustomMount
*m
= &arg_custom_mounts
[i
];
343 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
344 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
348 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
357 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
359 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Replaces *b with a cleaned-up version of path.
 *
 * The path is canonicalized if possible. If that fails because the path does
 * not exist (ENOENT) it is merely made absolute relative to the current
 * working directory instead. The previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any other
 * reason, or -ENOMEM if the absolute path could not be allocated. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Non-existing path: fall back to a purely lexical absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
386 static int parse_argv(int argc
, char *argv
[]) {
405 ARG_NETWORK_INTERFACE
,
417 static const struct option options
[] = {
418 { "help", no_argument
, NULL
, 'h' },
419 { "version", no_argument
, NULL
, ARG_VERSION
},
420 { "directory", required_argument
, NULL
, 'D' },
421 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
422 { "ephemeral", no_argument
, NULL
, 'x' },
423 { "user", required_argument
, NULL
, 'u' },
424 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
425 { "boot", no_argument
, NULL
, 'b' },
426 { "uuid", required_argument
, NULL
, ARG_UUID
},
427 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
428 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
429 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
430 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
431 { "bind", required_argument
, NULL
, ARG_BIND
},
432 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
433 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
434 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
435 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
436 { "machine", required_argument
, NULL
, 'M' },
437 { "slice", required_argument
, NULL
, 'S' },
438 { "setenv", required_argument
, NULL
, ARG_SETENV
},
439 { "selinux-context", required_argument
, NULL
, 'Z' },
440 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
441 { "quiet", no_argument
, NULL
, 'q' },
442 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
443 { "register", required_argument
, NULL
, ARG_REGISTER
},
444 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
445 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
446 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
447 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
448 { "network-veth", no_argument
, NULL
, 'n' },
449 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
450 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
451 { "image", required_argument
, NULL
, 'i' },
452 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
453 { "port", required_argument
, NULL
, 'p' },
454 { "property", required_argument
, NULL
, ARG_PROPERTY
},
455 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
456 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
461 uint64_t plus
= 0, minus
= 0;
466 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
475 puts(PACKAGE_STRING
);
476 puts(SYSTEMD_FEATURES
);
480 r
= set_sanitized_path(&arg_directory
, optarg
);
482 return log_error_errno(r
, "Invalid root directory: %m");
487 r
= set_sanitized_path(&arg_template
, optarg
);
489 return log_error_errno(r
, "Invalid template directory: %m");
494 r
= set_sanitized_path(&arg_image
, optarg
);
496 return log_error_errno(r
, "Invalid image path: %m");
501 arg_ephemeral
= true;
505 r
= free_and_strdup(&arg_user
, optarg
);
511 case ARG_NETWORK_BRIDGE
:
512 arg_network_bridge
= optarg
;
517 arg_network_veth
= true;
518 arg_private_network
= true;
521 case ARG_NETWORK_INTERFACE
:
522 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
525 arg_private_network
= true;
528 case ARG_NETWORK_MACVLAN
:
529 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
532 arg_private_network
= true;
535 case ARG_NETWORK_IPVLAN
:
536 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
541 case ARG_PRIVATE_NETWORK
:
542 arg_private_network
= true;
550 r
= sd_id128_from_string(optarg
, &arg_uuid
);
552 log_error("Invalid UUID: %s", optarg
);
562 if (isempty(optarg
)) {
563 arg_machine
= mfree(arg_machine
);
565 if (!machine_name_is_valid(optarg
)) {
566 log_error("Invalid machine name: %s", optarg
);
570 r
= free_and_strdup(&arg_machine
, optarg
);
578 arg_selinux_context
= optarg
;
582 arg_selinux_apifs_context
= optarg
;
586 arg_read_only
= true;
590 case ARG_DROP_CAPABILITY
: {
591 const char *state
, *word
;
594 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
595 _cleanup_free_
char *t
;
597 t
= strndup(word
, length
);
601 if (streq(t
, "all")) {
602 if (c
== ARG_CAPABILITY
)
603 plus
= (uint64_t) -1;
605 minus
= (uint64_t) -1;
609 cap
= capability_from_name(t
);
611 log_error("Failed to parse capability %s.", t
);
615 if (c
== ARG_CAPABILITY
)
616 plus
|= 1ULL << (uint64_t) cap
;
618 minus
|= 1ULL << (uint64_t) cap
;
626 arg_link_journal
= LINK_GUEST
;
627 arg_link_journal_try
= true;
630 case ARG_LINK_JOURNAL
:
631 if (streq(optarg
, "auto")) {
632 arg_link_journal
= LINK_AUTO
;
633 arg_link_journal_try
= false;
634 } else if (streq(optarg
, "no")) {
635 arg_link_journal
= LINK_NO
;
636 arg_link_journal_try
= false;
637 } else if (streq(optarg
, "guest")) {
638 arg_link_journal
= LINK_GUEST
;
639 arg_link_journal_try
= false;
640 } else if (streq(optarg
, "host")) {
641 arg_link_journal
= LINK_HOST
;
642 arg_link_journal_try
= false;
643 } else if (streq(optarg
, "try-guest")) {
644 arg_link_journal
= LINK_GUEST
;
645 arg_link_journal_try
= true;
646 } else if (streq(optarg
, "try-host")) {
647 arg_link_journal
= LINK_HOST
;
648 arg_link_journal_try
= true;
650 log_error("Failed to parse link journal mode %s", optarg
);
658 const char *current
= optarg
;
659 _cleanup_free_
char *source
= NULL
, *destination
= NULL
;
661 _cleanup_strv_free_
char **strv
= NULL
;
663 r
= extract_many_words(¤t
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
, &source
, &destination
, NULL
);
666 destination
= strdup(source
);
672 log_error("Invalid bind mount specification: %s", optarg
);
676 if (!source
|| !destination
)
679 if (!path_is_absolute(source
) || !path_is_absolute(destination
)) {
680 log_error("Invalid bind mount specification: %s", optarg
);
684 m
= custom_mount_add(CUSTOM_MOUNT_BIND
);
689 m
->destination
= destination
;
690 m
->read_only
= c
== ARG_BIND_RO
;
692 source
= destination
= NULL
;
698 const char *current
= optarg
;
699 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
702 r
= extract_first_word(¤t
, &path
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
706 log_error("Invalid tmpfs specification: %s", optarg
);
710 opts
= strdup(current
);
712 opts
= strdup("mode=0755");
717 if (!path_is_absolute(path
)) {
718 log_error("Invalid tmpfs specification: %s", optarg
);
722 m
= custom_mount_add(CUSTOM_MOUNT_TMPFS
);
726 m
->destination
= path
;
735 case ARG_OVERLAY_RO
: {
736 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
737 _cleanup_strv_free_
char **lower
= NULL
;
742 lower
= strv_split(optarg
, ":");
746 STRV_FOREACH(i
, lower
) {
747 if (!path_is_absolute(*i
)) {
748 log_error("Overlay path %s is not absolute.", *i
);
756 log_error("--overlay= needs at least two colon-separated directories specified.");
761 /* If two parameters are specified,
762 * the first one is the lower, the
763 * second one the upper directory. And
764 * we'll also define the destination
765 * mount point the same as the upper. */
769 destination
= strdup(upper
);
774 upper
= lower
[n
- 2];
775 destination
= lower
[n
- 1];
779 m
= custom_mount_add(CUSTOM_MOUNT_OVERLAY
);
783 m
->destination
= destination
;
786 m
->read_only
= c
== ARG_OVERLAY_RO
;
788 upper
= destination
= NULL
;
797 if (!env_assignment_is_valid(optarg
)) {
798 log_error("Environment variable assignment '%s' is not valid.", optarg
);
802 n
= strv_env_set(arg_setenv
, optarg
);
806 strv_free(arg_setenv
);
815 case ARG_SHARE_SYSTEM
:
816 arg_share_system
= true;
820 r
= parse_boolean(optarg
);
822 log_error("Failed to parse --register= argument: %s", optarg
);
830 arg_keep_unit
= true;
833 case ARG_PERSONALITY
:
835 arg_personality
= personality_from_string(optarg
);
836 if (arg_personality
== PERSONALITY_INVALID
) {
837 log_error("Unknown or unsupported personality '%s'.", optarg
);
846 arg_volatile
= VOLATILE_YES
;
848 r
= parse_boolean(optarg
);
850 if (streq(optarg
, "state"))
851 arg_volatile
= VOLATILE_STATE
;
853 log_error("Failed to parse --volatile= argument: %s", optarg
);
857 arg_volatile
= r
? VOLATILE_YES
: VOLATILE_NO
;
863 const char *split
, *e
;
864 uint16_t container_port
, host_port
;
868 if ((e
= startswith(optarg
, "tcp:")))
869 protocol
= IPPROTO_TCP
;
870 else if ((e
= startswith(optarg
, "udp:")))
871 protocol
= IPPROTO_UDP
;
874 protocol
= IPPROTO_TCP
;
877 split
= strchr(e
, ':');
879 char v
[split
- e
+ 1];
881 memcpy(v
, e
, split
- e
);
884 r
= safe_atou16(v
, &host_port
);
885 if (r
< 0 || host_port
<= 0) {
886 log_error("Failed to parse host port: %s", optarg
);
890 r
= safe_atou16(split
+ 1, &container_port
);
892 r
= safe_atou16(e
, &container_port
);
893 host_port
= container_port
;
896 if (r
< 0 || container_port
<= 0) {
897 log_error("Failed to parse host port: %s", optarg
);
901 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
902 if (p
->protocol
== protocol
&& p
->host_port
== host_port
) {
903 log_error("Duplicate port specification: %s", optarg
);
908 p
= new(ExposePort
, 1);
912 p
->protocol
= protocol
;
913 p
->host_port
= host_port
;
914 p
->container_port
= container_port
;
916 LIST_PREPEND(ports
, arg_expose_ports
, p
);
922 if (strv_extend(&arg_property
, optarg
) < 0)
927 case ARG_PRIVATE_USERS
:
929 _cleanup_free_
char *buffer
= NULL
;
930 const char *range
, *shift
;
932 range
= strchr(optarg
, ':');
934 buffer
= strndup(optarg
, range
- optarg
);
940 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
941 log_error("Failed to parse UID range: %s", range
);
947 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
948 log_error("Failed to parse UID: %s", optarg
);
956 case ARG_KILL_SIGNAL
:
957 arg_kill_signal
= signal_from_string_try_harder(optarg
);
958 if (arg_kill_signal
< 0) {
959 log_error("Cannot parse signal: %s", optarg
);
969 assert_not_reached("Unhandled option");
972 if (arg_share_system
)
973 arg_register
= false;
975 if (arg_boot
&& arg_share_system
) {
976 log_error("--boot and --share-system may not be combined.");
980 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
981 log_error("--keep-unit may not be used when invoked from a user session.");
985 if (arg_directory
&& arg_image
) {
986 log_error("--directory= and --image= may not be combined.");
990 if (arg_template
&& arg_image
) {
991 log_error("--template= and --image= may not be combined.");
995 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
996 log_error("--template= needs --directory= or --machine=.");
1000 if (arg_ephemeral
&& arg_template
) {
1001 log_error("--ephemeral and --template= may not be combined.");
1005 if (arg_ephemeral
&& arg_image
) {
1006 log_error("--ephemeral and --image= may not be combined.");
1010 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1011 log_error("--ephemeral and --link-journal= may not be combined.");
1015 if (arg_volatile
!= VOLATILE_NO
&& arg_read_only
) {
1016 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1020 if (arg_expose_ports
&& !arg_private_network
) {
1021 log_error("Cannot use --port= without private networking.");
1025 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
1026 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
1028 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1030 if (arg_boot
&& arg_kill_signal
<= 0)
1031 arg_kill_signal
= SIGRTMIN
+3;
1036 static int tmpfs_patch_options(const char *options
, char **ret
) {
1039 if (arg_userns
&& arg_uid_shift
!= 0) {
1040 assert(arg_uid_shift
!= UID_INVALID
);
1043 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, arg_uid_shift
, arg_uid_shift
);
1045 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, arg_uid_shift
, arg_uid_shift
);
1053 if (arg_selinux_apifs_context
) {
1057 t
= strjoin(options
, ",context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1059 t
= strjoin("context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1074 static int mount_all(const char *dest
, bool userns
) {
1076 typedef struct MountPoint
{
1080 const char *options
;
1081 unsigned long flags
;
1086 static const MountPoint mount_table
[] = {
1087 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
1088 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
1089 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
1090 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
1091 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, true, false },
1092 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
1093 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1094 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1095 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
1097 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
1098 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
1105 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
1106 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
1109 if (userns
!= mount_table
[k
].userns
)
1112 where
= prefix_root(dest
, mount_table
[k
].where
);
1116 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
1117 if (r
< 0 && r
!= -ENOENT
)
1118 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
1120 /* Skip this entry if it is not a remount. */
1121 if (mount_table
[k
].what
&& r
> 0)
1124 r
= mkdir_p(where
, 0755);
1126 if (mount_table
[k
].fatal
)
1127 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
1129 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
1133 o
= mount_table
[k
].options
;
1134 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
1135 r
= tmpfs_patch_options(o
, &options
);
1142 if (mount(mount_table
[k
].what
,
1144 mount_table
[k
].type
,
1145 mount_table
[k
].flags
,
1148 if (mount_table
[k
].fatal
)
1149 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1151 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
1158 static int mount_bind(const char *dest
, CustomMount
*m
) {
1159 struct stat source_st
, dest_st
;
1165 if (stat(m
->source
, &source_st
) < 0)
1166 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
1168 where
= prefix_roota(dest
, m
->destination
);
1170 if (stat(where
, &dest_st
) >= 0) {
1171 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
1172 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
1176 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
1177 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
1181 } else if (errno
== ENOENT
) {
1182 r
= mkdir_parents_label(where
, 0755);
1184 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
1186 log_error_errno(errno
, "Failed to stat %s: %m", where
);
1190 /* Create the mount point. Any non-directory file can be
1191 * mounted on any non-directory file (regular, fifo, socket,
1194 if (S_ISDIR(source_st
.st_mode
))
1195 r
= mkdir_label(where
, 0755);
1198 if (r
< 0 && r
!= -EEXIST
)
1199 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
1201 if (mount(m
->source
, where
, NULL
, MS_BIND
, NULL
) < 0)
1202 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1205 r
= bind_remount_recursive(where
, true);
1207 return log_error_errno(r
, "Read-only bind mount failed: %m");
1213 static int mount_tmpfs(const char *dest
, CustomMount
*m
) {
1214 const char *where
, *options
;
1215 _cleanup_free_
char *buf
= NULL
;
1221 where
= prefix_roota(dest
, m
->destination
);
1223 r
= mkdir_p_label(where
, 0755);
1224 if (r
< 0 && r
!= -EEXIST
)
1225 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
1227 r
= tmpfs_patch_options(m
->options
, &buf
);
1230 options
= r
> 0 ? buf
: m
->options
;
1232 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
1233 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
1238 static char *joined_and_escaped_lower_dirs(char * const *lower
) {
1239 _cleanup_strv_free_
char **sv
= NULL
;
1241 sv
= strv_copy(lower
);
1247 if (!strv_shell_escape(sv
, ",:"))
1250 return strv_join(sv
, ":");
1253 static int mount_overlay(const char *dest
, CustomMount
*m
) {
1254 _cleanup_free_
char *lower
= NULL
;
1255 const char *where
, *options
;
1261 where
= prefix_roota(dest
, m
->destination
);
1263 r
= mkdir_label(where
, 0755);
1264 if (r
< 0 && r
!= -EEXIST
)
1265 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
1267 (void) mkdir_p_label(m
->source
, 0755);
1269 lower
= joined_and_escaped_lower_dirs(m
->lower
);
1274 _cleanup_free_
char *escaped_source
= NULL
;
1276 escaped_source
= shell_escape(m
->source
, ",:");
1277 if (!escaped_source
)
1280 options
= strjoina("lowerdir=", escaped_source
, ":", lower
);
1282 _cleanup_free_
char *escaped_source
= NULL
, *escaped_work_dir
= NULL
;
1284 assert(m
->work_dir
);
1285 (void) mkdir_label(m
->work_dir
, 0700);
1287 escaped_source
= shell_escape(m
->source
, ",:");
1288 if (!escaped_source
)
1290 escaped_work_dir
= shell_escape(m
->work_dir
, ",:");
1291 if (!escaped_work_dir
)
1294 options
= strjoina("lowerdir=", lower
, ",upperdir=", escaped_source
, ",workdir=", escaped_work_dir
);
1297 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
1298 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
1303 static int mount_custom(const char *dest
) {
1309 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
1310 CustomMount
*m
= &arg_custom_mounts
[i
];
1314 case CUSTOM_MOUNT_BIND
:
1315 r
= mount_bind(dest
, m
);
1318 case CUSTOM_MOUNT_TMPFS
:
1319 r
= mount_tmpfs(dest
, m
);
1322 case CUSTOM_MOUNT_OVERLAY
:
1323 r
= mount_overlay(dest
, m
);
1327 assert_not_reached("Unknown custom mount type");
1337 static int mount_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
1341 to
= strjoina(dest
, "/sys/fs/cgroup/", hierarchy
);
1343 r
= path_is_mount_point(to
, 0);
1344 if (r
< 0 && r
!= -ENOENT
)
1345 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
1351 /* The superblock mount options of the mount point need to be
1352 * identical to the hosts', and hence writable... */
1353 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
1354 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
1356 /* ... hence let's only make the bind mount read-only, not the
1359 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1360 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
1365 static int mount_cgroup(const char *dest
) {
1366 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1367 const char *cgroup_root
;
1370 controllers
= set_new(&string_hash_ops
);
1374 r
= cg_kernel_controllers(controllers
);
1376 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1379 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
1381 controller
= set_steal_first(controllers
);
1385 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1389 r
= readlink_malloc(origin
, &combined
);
1391 /* Not a symbolic link, but directly a single cgroup hierarchy */
1393 r
= mount_cgroup_hierarchy(dest
, controller
, controller
, true);
1398 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1400 _cleanup_free_
char *target
= NULL
;
1402 target
= prefix_root(dest
, origin
);
1406 /* A symbolic link, a combination of controllers in one hierarchy */
1408 if (!filename_is_valid(combined
)) {
1409 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1413 r
= mount_cgroup_hierarchy(dest
, combined
, combined
, true);
1417 r
= symlink_idempotent(combined
, target
);
1419 log_error("Invalid existing symlink for combined hierarchy");
1423 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
1427 r
= mount_cgroup_hierarchy(dest
, "name=systemd,xattr", "systemd", false);
1431 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1432 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
1433 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
1438 static int mount_systemd_cgroup_writable(const char *dest
) {
1439 _cleanup_free_
char *own_cgroup_path
= NULL
;
1440 const char *systemd_root
, *systemd_own
;
1445 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1447 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1449 /* Make our own cgroup a (writable) bind mount */
1450 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
1451 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
1452 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
1454 /* And then remount the systemd cgroup root read-only */
1455 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1456 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1457 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
1462 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1468 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1471 if (uid
!= UID_INVALID
) {
1472 uid
+= arg_uid_shift
;
1474 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1478 if (gid
!= GID_INVALID
) {
1479 gid
+= (gid_t
) arg_uid_shift
;
1481 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1485 if (lchown(p
, uid
, gid
) < 0)
/* Creates the directory path below root with the given mode and hands it to
 * userns_lchown() with uid/gid. A directory that already exists is left
 * alone, including its ownership.
 *
 * Returns 0 if the directory already existed, -errno if mkdir() fails for
 * another reason, otherwise the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
/* Tries to make the container's /etc/localtime symlink point at the same
 * zoneinfo entry as the host's.
 *
 * dest: root directory of the container tree.
 *
 * Steps visible here: read the host's /etc/localtime link and extract the
 * zone name relative to "/usr/share/zoneinfo/" (absolute or "../"
 * relative form); do the same for the container's link; stop if both name
 * the same zone; check that the zone file exists inside the container;
 * create a new "../usr/share/zoneinfo/<zone>" symlink and chown it with
 * userns_lchown(where, 0, 0). Most problems are only logged as warnings. */
1504 static int setup_timezone(const char *dest
) {
1505 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1506 const char *where
, *check
, *what
;
1512 /* Fix the timezone, if possible */
1513 r
= readlink_malloc("/etc/localtime", &p
);
1515 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z: zone name of the host link, relative to the zoneinfo directory */
1519 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1521 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1523 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1527 where
= prefix_roota(dest
, "/etc/localtime");
1528 r
= readlink_malloc(where
, &q
);
/* y: zone name of the container's current link, same convention as z */
1530 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1532 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1534 /* Already pointing to the right place? Then do nothing .. */
1535 if (y
&& streq(y
, z
))
1539 check
= strjoina("/usr/share/zoneinfo/", z
);
/* NOTE(review): this calls prefix_root(), not prefix_roota() as used for
 * `where` above. Elsewhere in this file the prefix_root() result is kept
 * in a _cleanup_free_ variable, which suggests it is heap-allocated;
 * here it lands in a plain const char * with no visible free or NULL
 * check -- possible leak, confirm. */
1540 check
= prefix_root(dest
, check
);
1541 if (laccess(check
, F_OK
) < 0) {
1542 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1547 if (r
< 0 && errno
!= ENOENT
) {
1548 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1552 what
= strjoina("../usr/share/zoneinfo/", z
);
1553 if (symlink(what
, where
) < 0) {
1554 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1558 r
= userns_lchown(where
, 0, 0);
1560 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Copies the host's /etc/resolv.conf to <dest>/etc/resolv.conf and chowns
 * the copy with userns_lchown(where, 0, 0).
 *
 * dest: root directory of the container tree.
 *
 * arg_private_network is tested first; the branch body is not visible in
 * this listing (presumably the copy is skipped then -- TODO confirm).
 * The copy uses O_TRUNC|O_NOFOLLOW and mode 0644. A copy failure is logged
 * at LOG_DEBUG when the error is -ELOOP or -EROFS and at LOG_WARNING
 * otherwise; a chown failure is logged as a warning. */
1565 static int setup_resolv_conf(const char *dest
) {
1566 const char *where
= NULL
;
1571 if (arg_private_network
)
1574 /* Fix resolv.conf, if possible */
1575 where
= prefix_roota(dest
, "/etc/resolv.conf");
1577 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1579 /* If the file already exists as symlink, let's
1580 * suppress the warning, under the assumption that
1581 * resolved or something similar runs inside and the
1582 * symlink points there.
1584 * If the disk image is read-only, there's also no
1585 * point in complaining.
1587 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1588 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1592 r
= userns_lchown(where
, 0, 0);
1594 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Implements --volatile=state: remounts the container tree read-only and
 * mounts a fresh tmpfs on <directory>/var.
 *
 * directory: root directory of the container tree.
 *
 * Only relevant when arg_volatile == VOLATILE_STATE (the branch taken
 * otherwise is not visible here). The tmpfs is mounted with MS_STRICTATIME
 * and options starting from "mode=755", passed through
 * tmpfs_patch_options() first. */
1599 static int setup_volatile_state(const char *directory
) {
1600 _cleanup_free_
char *buf
= NULL
;
1601 const char *p
, *options
;
1606 if (arg_volatile
!= VOLATILE_STATE
)
1609 /* --volatile=state means we simply overmount /var
1610 with a tmpfs, and the rest read-only. */
1612 r
= bind_remount_recursive(directory
, true);
1614 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1616 p
= prefix_roota(directory
, "/var");
/* NOTE(review): the creation call whose result is tested here is not
 * visible in this listing. The message prints `directory`, while the path
 * computed just above is `p` -- the message probably should name p;
 * confirm. */
1618 if (r
< 0 && errno
!= EEXIST
)
1619 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1621 options
= "mode=755";
1622 r
= tmpfs_patch_options(options
, &buf
);
1628 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
1629 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes: builds a tmpfs root in a temporary directory,
 * bind-mounts the container's /usr into it read-only, then moves the
 * result over the container root.
 *
 * directory: root directory of the container tree.
 *
 * Only relevant when arg_volatile == VOLATILE_YES (the branch taken
 * otherwise is not visible here). tmpfs_mounted and bind_mounted record
 * how far setup got; the trailing umount()/rmdir() calls on the template
 * directory look like the failure clean-up path, though the labels and
 * conditions around them are not visible in this listing. */
1634 static int setup_volatile(const char *directory
) {
1635 bool tmpfs_mounted
= false, bind_mounted
= false;
1636 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1637 _cleanup_free_
char *buf
= NULL
;
1638 const char *f
, *t
, *options
;
1643 if (arg_volatile
!= VOLATILE_YES
)
1646 /* --volatile=yes means we mount a tmpfs to the root dir, and
1647 the original /usr to use inside it, and that read-only. */
1649 if (!mkdtemp(template))
1650 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1652 options
= "mode=755";
1653 r
= tmpfs_patch_options(options
, &buf
);
1659 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
1660 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
1664 tmpfs_mounted
= true;
/* f: /usr of the original tree; t: /usr inside the new tmpfs root */
1666 f
= prefix_roota(directory
, "/usr");
1667 t
= prefix_roota(template, "/usr");
1670 if (r
< 0 && errno
!= EEXIST
) {
1671 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1675 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1676 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
1680 bind_mounted
= true;
1682 r
= bind_remount_recursive(t
, true);
1684 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
/* Move the assembled tmpfs root on top of the container directory */
1688 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
1689 r
= log_error_errno(errno
, "Failed to move root mount: %m");
1693 (void) rmdir(template);
1702 (void) umount(template);
1703 (void) rmdir(template);
/* Formats a 128-bit ID in the hyphenated 8-4-4-4-12 hex layout.
 *
 * id: the ID to format.
 * s:  caller-provided buffer of 37 bytes; the format below yields 36
 *     characters (32 hex digits plus 4 hyphens), leaving room for the NUL.
 *
 * The formatting call and the return statement are not visible in this
 * listing; presumably s is returned -- TODO confirm. */
1707 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1711 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1712 SD_ID128_FORMAT_VAL(id
));
/* Gives the container its own random boot ID by writing one to a file
 * under <dest>/run and bind-mounting that file over
 * <dest>/proc/sys/kernel/random/boot_id.
 *
 * dest: root directory of the container tree.
 *
 * arg_share_system is tested first; the branch body is not visible here
 * (presumably nothing is done in that mode -- TODO confirm). A failure of
 * the read-only remount of the bind mount is only logged as a warning. */
1717 static int setup_boot_id(const char *dest
) {
1718 const char *from
, *to
;
1719 sd_id128_t rnd
= {};
1723 if (arg_share_system
)
1726 /* Generate a new randomized boot ID, so that each boot-up of
1727 * the container gets a new one */
1729 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1730 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1732 r
= sd_id128_randomize(&rnd
);
1734 return log_error_errno(r
, "Failed to generate random boot id: %m");
/* as_uuid is declared on a line not visible in this listing */
1736 id128_format_as_uuid(rnd
, as_uuid
);
1738 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1740 return log_error_errno(r
, "Failed to write boot id: %m");
1742 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1743 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1744 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1745 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Populates the container's /dev with a fixed set of device nodes copied
 * from the host.
 *
 * dest: root directory of the container tree.
 *
 * For every name in the NUL-separated `devnodes` list (its contents are
 * not visible in this listing) the host node /dev/<name> is stat()ed. A
 * host node that exists but is neither a character nor a block device is
 * reported as an error. Otherwise a node with the same mode and device
 * number is created under dest with mknod(); if that fails, a plain file
 * is bind-mounted from the host node instead. The result is chowned with
 * userns_lchown(to, 0, 0). */
1751 static int copy_devnodes(const char *dest
) {
1753 static const char devnodes
[] =
1764 _cleanup_umask_ mode_t u
;
1770 /* Create /dev/net, so that we can create /dev/net/tun in it */
/* NOTE(review): the result of userns_mkdir() is tested but never stored
 * in r, yet r is what gets passed to log_error_errno() below. Unless r is
 * assigned on a line not visible here, the logged/returned code is not
 * the actual failure -- likely should be `r = userns_mkdir(...)` followed
 * by a test on r; confirm. */
1771 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1772 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1774 NULSTR_FOREACH(d
, devnodes
) {
1775 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
/* NOTE(review): no NULL check on from/to is visible in this listing */
1778 from
= strappend("/dev/", d
);
1779 to
= prefix_root(dest
, from
);
1781 if (stat(from
, &st
) < 0) {
1783 if (errno
!= ENOENT
)
1784 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1786 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1788 log_error("%s is not a char or block device, cannot copy.", from
);
1792 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1794 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1796 /* Some systems abusively restrict mknod but
1797 * allow bind mounts. */
1800 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1801 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1802 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1805 r
= userns_lchown(to
, 0, 0);
1807 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Mounts a private devpts instance on <dest>/dev/pts, creates the
 * /dev/ptmx -> pts/ptmx symlink, and chowns /dev/pts, /dev/ptmx and
 * /dev/pts/ptmx with userns_lchown(..., 0, 0).
 *
 * dest: root directory of the container tree.
 *
 * The mount options are "newinstance,ptmxmode=0666,mode=620,gid=<gid>"
 * with gid = arg_uid_shift + TTY_GID, plus a context="..." suffix when
 * arg_selinux_apifs_context is set. */
1814 static int setup_pts(const char *dest
) {
1815 _cleanup_free_
char *options
= NULL
;
/* NOTE(review): both asprintf() results are discarded via (void); a
 * failure would have to be detected through `options` afterwards. That
 * check is not visible in this listing -- confirm it exists. */
1819 if (arg_selinux_apifs_context
)
1820 (void) asprintf(&options
,
1821 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
1822 arg_uid_shift
+ TTY_GID
,
1823 arg_selinux_apifs_context
);
1826 (void) asprintf(&options
,
1827 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
1828 arg_uid_shift
+ TTY_GID
);
1833 /* Mount /dev/pts itself */
1834 p
= prefix_roota(dest
, "/dev/pts");
1835 if (mkdir(p
, 0755) < 0)
1836 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1837 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1838 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
/* NOTE(review): here and below, errno is logged after userns_lchown()
 * fails, whereas other callers in this file log that function's return
 * value. Confirm errno is still meaningful on every failure path of
 * userns_lchown(). */
1839 if (userns_lchown(p
, 0, 0) < 0)
1840 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1842 /* Create /dev/ptmx symlink */
1843 p
= prefix_roota(dest
, "/dev/ptmx");
1844 if (symlink("pts/ptmx", p
) < 0)
1845 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1846 if (userns_lchown(p
, 0, 0) < 0)
1847 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1849 /* And fix /dev/pts/ptmx ownership */
1850 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1851 if (userns_lchown(p
, 0, 0) < 0)
1852 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
/* Makes the given TTY available as <dest>/dev/console via a bind mount.
 *
 * dest:    root directory of the container tree.
 * console: path of the TTY device to expose.
 *
 * The TTY is first set to mode 0600 and owned by arg_uid_shift (used for
 * both the user and the group argument of chmod_and_chown()). */
1857 static int setup_dev_console(const char *dest
, const char *console
) {
1858 _cleanup_umask_ mode_t u
;
1867 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1869 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1871 /* We need to bind mount the right tty to /dev/console since
1872 * ptys can only exist on pts file systems. To have something
1873 * to bind mount things on we create an empty regular file. */
1875 to
= prefix_roota(dest
, "/dev/console");
1878 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1880 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1881 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provides the container with a fake /proc/kmsg backed by a FIFO.
 *
 * dest:        root directory of the container tree.
 * kmsg_socket: socket (must be >= 0) over which the opened FIFO file
 *              descriptor is sent as SCM_RIGHTS ancillary data.
 *
 * The FIFO is created as <dest>/run/kmsg with mode 0600, bind-mounted onto
 * <dest>/proc/kmsg, opened O_RDWR|O_NDELAY|O_CLOEXEC, its fd sent over
 * kmsg_socket with sendmsg(MSG_NOSIGNAL), and finally the /run/kmsg name
 * is unlinked. */
1886 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1887 const char *from
, *to
;
1888 _cleanup_umask_ mode_t u
;
/* Control-message buffer sized for exactly one int (the fd) */
1891 struct cmsghdr cmsghdr
;
1892 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1894 struct msghdr mh
= {
1895 .msg_control
= &control
,
1896 .msg_controllen
= sizeof(control
),
1898 struct cmsghdr
*cmsg
;
1900 assert(kmsg_socket
>= 0);
1904 /* We create the kmsg FIFO as /run/kmsg, but immediately
1905 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1906 * on the reading side behave very similar to /proc/kmsg,
1907 * their writing side behaves differently from /dev/kmsg in
1908 * that writing blocks when nothing is reading. In order to
1909 * avoid any problems with containers deadlocking due to this
1910 * we simply make /dev/kmsg unavailable to the container. */
1911 from
= prefix_roota(dest
, "/run/kmsg");
1912 to
= prefix_roota(dest
, "/proc/kmsg");
1914 if (mkfifo(from
, 0600) < 0)
1915 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1916 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1917 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1919 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1921 return log_error_errno(errno
, "Failed to open fifo: %m");
/* Fill in a single SCM_RIGHTS control message carrying fd */
1923 cmsg
= CMSG_FIRSTHDR(&mh
);
1924 cmsg
->cmsg_level
= SOL_SOCKET
;
1925 cmsg
->cmsg_type
= SCM_RIGHTS
;
1926 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1927 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1929 mh
.msg_controllen
= cmsg
->cmsg_len
;
1931 /* Store away the fd in the socket, so that it stays open as
1932 * long as we run the child */
1933 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1937 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1939 /* And now make the FIFO unavailable as /run/kmsg... */
1940 (void) unlink(from
);
/* Opens a NETLINK_ROUTE socket and passes its file descriptor over
 * send_fd as SCM_RIGHTS ancillary data.
 *
 * send_fd: socket (must be >= 0) to send the netlink fd over.
 *
 * arg_expose_ports is tested first; the branch body is not visible here
 * (presumably nothing is sent when no ports are exposed -- TODO confirm).
 * The local fd is declared _cleanup_close_. */
1945 static int send_rtnl(int send_fd
) {
1947 struct cmsghdr cmsghdr
;
1948 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1950 struct msghdr mh
= {
1951 .msg_control
= &control
,
1952 .msg_controllen
= sizeof(control
),
1954 struct cmsghdr
*cmsg
;
1955 _cleanup_close_
int fd
= -1;
1958 assert(send_fd
>= 0);
1960 if (!arg_expose_ports
)
1963 fd
= socket(PF_NETLINK
, SOCK_RAW
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, NETLINK_ROUTE
);
1965 return log_error_errno(errno
, "Failed to allocate container netlink: %m");
/* Fill in a single SCM_RIGHTS control message carrying fd */
1967 cmsg
= CMSG_FIRSTHDR(&mh
);
1968 cmsg
->cmsg_level
= SOL_SOCKET
;
1969 cmsg
->cmsg_type
= SCM_RIGHTS
;
1970 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1971 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1973 mh
.msg_controllen
= cmsg
->cmsg_len
;
1975 /* Store away the fd in the socket, so that it stays open as
1976 * long as we run the child */
1977 k
= sendmsg(send_fd
, &mh
, MSG_NOSIGNAL
);
1979 return log_error_errno(errno
, "Failed to send netlink fd: %m");
/* Removes the firewall DNAT entries for the exposed ports and resets
 * *exposed to IN_ADDR_NULL.
 *
 * exposed: in/out; the container address the rules were installed for.
 *
 * Works on AF_INET only. For each entry of arg_expose_ports,
 * fw_add_local_dnat() is called with `false` as first argument (the
 * remaining arguments are not visible in this listing); failures are
 * logged as warnings. The early-exit branches for "no ports configured"
 * and "address already null" are not visible here. */
1984 static int flush_ports(union in_addr_union
*exposed
) {
1986 int r
, af
= AF_INET
;
1990 if (!arg_expose_ports
)
1993 if (in_addr_is_null(af
, exposed
))
1996 log_debug("Lost IP address.");
1998 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
1999 r
= fw_add_local_dnat(false,
2010 log_warning_errno(r
, "Failed to modify firewall: %m");
2013 *exposed
= IN_ADDR_NULL
;
/* Re-evaluates the container's address and (re)installs the firewall DNAT
 * entries for arg_expose_ports accordingly.
 *
 * rtnl:    netlink connection used to enumerate local addresses.
 * exposed: in/out; the address the rules are currently installed for. It
 *          is set to the newly chosen address at the end.
 *
 * Works on AF_INET only. The first entry returned by local_addresses() is
 * considered usable when its family is AF_INET and its scope is below
 * RT_SCOPE_LINK; when the (not fully visible) condition fails,
 * flush_ports() is called instead. If the address equals *exposed the
 * function stops early (branch body not visible). fw_add_local_dnat() is
 * called with `true` for every port, passing the previous address unless
 * it is null; failures are logged as warnings. */
2017 static int expose_ports(sd_netlink
*rtnl
, union in_addr_union
*exposed
) {
2018 _cleanup_free_
struct local_address
*addresses
= NULL
;
2019 _cleanup_free_
char *pretty
= NULL
;
2020 union in_addr_union new_exposed
;
2023 int af
= AF_INET
, r
;
2027 /* Invoked each time an address is added or removed inside the
2030 if (!arg_expose_ports
)
2033 r
= local_addresses(rtnl
, 0, af
, &addresses
);
2035 return log_error_errno(r
, "Failed to enumerate local addresses: %m");
2038 addresses
[0].family
== af
&&
2039 addresses
[0].scope
< RT_SCOPE_LINK
;
2042 return flush_ports(exposed
);
2044 new_exposed
= addresses
[0].address
;
2045 if (in_addr_equal(af
, exposed
, &new_exposed
))
2048 in_addr_to_string(af
, &new_exposed
, &pretty
);
2049 log_debug("New container IP is %s.", strna(pretty
));
2051 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
2053 r
= fw_add_local_dnat(true,
2062 in_addr_is_null(af
, exposed
) ? NULL
: exposed
);
2064 log_warning_errno(r
, "Failed to modify firewall: %m");
2067 *exposed
= new_exposed
;
/* Netlink match callback: forwards to expose_ports() with the address
 * state passed as userdata. Registered for RTM_NEWADDR and RTM_DELADDR in
 * watch_rtnl(). The result of expose_ports() is not used here.
 *
 * rtnl:     netlink connection the message arrived on.
 * m:        the message; not referenced in the lines visible here.
 * userdata: pointer to the union in_addr_union holding the currently
 *           exposed address. */
2071 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2072 union in_addr_union
*exposed
= userdata
;
2078 expose_ports(rtnl
, exposed
);
/* Receives a netlink socket fd over recv_fd (SCM_RIGHTS), wraps it in an
 * sd_netlink object, subscribes on_address_change() to RTM_NEWADDR and
 * RTM_DELADDR, and attaches the object to the event loop.
 *
 * event:   event loop to attach to.
 * recv_fd: socket (must be >= 0) to receive the fd from.
 * exposed: passed as userdata to on_address_change().
 * ret:     output for the netlink object; the assignment is not visible
 *          in this listing.
 *
 * arg_expose_ports is tested first; the branch body is not visible here. */
2082 static int watch_rtnl(sd_event
*event
, int recv_fd
, union in_addr_union
*exposed
, sd_netlink
**ret
) {
2084 struct cmsghdr cmsghdr
;
2085 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2087 struct msghdr mh
= {
2088 .msg_control
= &control
,
2089 .msg_controllen
= sizeof(control
),
2091 struct cmsghdr
*cmsg
;
2092 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2097 assert(recv_fd
>= 0);
2100 if (!arg_expose_ports
)
2103 k
= recvmsg(recv_fd
, &mh
, MSG_NOSIGNAL
);
2105 return log_error_errno(errno
, "Failed to recv netlink fd: %m");
/* NOTE(review): the shape of the received control message is enforced
 * with assert() only, and no NULL check on cmsg is visible before it is
 * dereferenced. */
2107 cmsg
= CMSG_FIRSTHDR(&mh
);
2108 assert(cmsg
->cmsg_level
== SOL_SOCKET
);
2109 assert(cmsg
->cmsg_type
== SCM_RIGHTS
);
2110 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
2111 memcpy(&fd
, CMSG_DATA(cmsg
), sizeof(int));
2113 r
= sd_netlink_open_fd(&rtnl
, fd
);
2116 return log_error_errno(r
, "Failed to create rtnl object: %m");
2119 r
= sd_netlink_add_match(rtnl
, RTM_NEWADDR
, on_address_change
, exposed
);
2121 return log_error_errno(r
, "Failed to subscribe to RTM_NEWADDR messages: %m");
2123 r
= sd_netlink_add_match(rtnl
, RTM_DELADDR
, on_address_change
, exposed
);
2125 return log_error_errno(r
, "Failed to subscribe to RTM_DELADDR messages: %m");
2127 r
= sd_netlink_attach_event(rtnl
, event
, 0);
/* NOTE(review): the message below says "even loop"; "event loop" is
 * presumably meant. */
2129 return log_error_errno(r
, "Failed to add to even loop: %m");
/* Sets the hostname to arg_machine using sethostname_idempotent().
 *
 * arg_share_system is tested first; the branch body and the return values
 * are not visible in this listing (presumably the hostname is left alone
 * when the system is shared -- TODO confirm). */
2137 static int setup_hostname(void) {
2139 if (arg_share_system
)
2142 if (sethostname_idempotent(arg_machine
) < 0)
/* Links the container's journal directory with the host's, according to
 * arg_link_journal.
 *
 * directory: root directory of the container tree.
 *
 * Visible flow:
 *  - read the container's /etc/machine-id, parse it, and compare it with
 *    the host's machine ID; equal IDs are refused (warning in LINK_AUTO
 *    mode, error otherwise);
 *  - create /var, /var/log and /var/log/journal inside the container via
 *    userns_mkdir();
 *  - p is the host-side path /var/log/journal/<id>, q the same path inside
 *    the container tree;
 *  - either path already being a mount point is an error unless the mode
 *    is LINK_AUTO;
 *  - LINK_GUEST: symlink p -> q on the host;
 *  - LINK_HOST: ensure p exists, ensure q exists in the container, then
 *    bind-mount p onto q.
 * With arg_link_journal_try set, some failures are only logged at debug
 * level. Several branch bodies and return statements are not visible in
 * this listing. */
2148 static int setup_journal(const char *directory
) {
2149 sd_id128_t machine_id
, this_id
;
2150 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
2151 const char *etc_machine_id
, *p
, *q
;
2155 /* Don't link journals in ephemeral mode */
2159 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2161 r
= read_one_line_file(etc_machine_id
, &b
);
2162 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
2165 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
/* id is declared and assigned on lines not visible in this listing */
2168 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
2171 /* Verify validity */
2172 r
= sd_id128_from_string(id
, &machine_id
);
2174 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2176 r
= sd_id128_get_machine(&this_id
);
2178 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2180 if (sd_id128_equal(machine_id
, this_id
)) {
2181 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
2182 "Host and machine ids are equal (%s): refusing to link journals", id
);
2183 if (arg_link_journal
== LINK_AUTO
)
2188 if (arg_link_journal
== LINK_NO
)
2191 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
2193 return log_error_errno(r
, "Failed to create /var: %m");
2195 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
2197 return log_error_errno(r
, "Failed to create /var/log: %m");
2199 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
2201 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
/* p: journal directory on the host; q: same path inside the container */
2203 p
= strjoina("/var/log/journal/", id
);
2204 q
= prefix_roota(directory
, p
);
2206 if (path_is_mount_point(p
, 0) > 0) {
2207 if (arg_link_journal
!= LINK_AUTO
) {
2208 log_error("%s: already a mount point, refusing to use for journal", p
);
2215 if (path_is_mount_point(q
, 0) > 0) {
2216 if (arg_link_journal
!= LINK_AUTO
) {
2217 log_error("%s: already a mount point, refusing to use for journal", q
);
2224 r
= readlink_and_make_absolute(p
, &d
);
2226 if ((arg_link_journal
== LINK_GUEST
||
2227 arg_link_journal
== LINK_AUTO
) &&
2230 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
/* NOTE(review): here and at the later userns_mkdir() calls the result is
 * stored in r but the log call is given errno -- confirm errno is still
 * the right value, or whether r was intended. */
2232 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2237 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2238 } else if (r
== -EINVAL
) {
2240 if (arg_link_journal
== LINK_GUEST
&&
2243 if (errno
== ENOTDIR
) {
2244 log_error("%s already exists and is neither a symlink nor a directory", p
);
2247 log_error_errno(errno
, "Failed to remove %s: %m", p
);
2251 } else if (r
!= -ENOENT
) {
/* NOTE(review): r holds the readlink_and_make_absolute() result in this
 * branch, but errno is what gets logged -- confirm. */
2252 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
2256 if (arg_link_journal
== LINK_GUEST
) {
/* Creates the link named p (host side) pointing at q (container side) */
2258 if (symlink(q
, p
) < 0) {
2259 if (arg_link_journal_try
) {
2260 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2263 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2268 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2270 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2274 if (arg_link_journal
== LINK_HOST
) {
2275 /* don't create parents here -- if the host doesn't have
2276 * permanent journal set up, don't force it here */
2279 if (arg_link_journal_try
) {
2280 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
2283 log_error_errno(errno
, "Failed to create %s: %m", p
);
2288 } else if (access(p
, F_OK
) < 0)
2291 if (dir_is_empty(q
) == 0)
2292 log_warning("%s is not empty, proceeding anyway.", q
);
2294 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2296 log_error_errno(errno
, "Failed to create %s: %m", q
);
2300 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
2301 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
/* Drops from the capability bounding set every capability whose bit is
 * not set in arg_retain (the complement of arg_retain is passed to
 * capability_bounding_set_drop(), with `false` as its second argument),
 * and returns that call's result. */
2306 static int drop_capabilities(void) {
2307 return capability_bounding_set_drop(~arg_retain
, false);
/* Registers the container with org.freedesktop.machine1 over the system
 * bus.
 *
 * pid:           PID of the container's leader process; it is presumably
 *                among the call arguments, which are not all visible in
 *                this listing.
 * local_ifindex: host-side network interface index; passed as a one
 *                element array when > 0, otherwise as an empty array.
 *
 * With arg_keep_unit set, RegisterMachineWithNetwork is called directly.
 * Otherwise a CreateMachineWithNetwork call is assembled with a property
 * array holding: Slice (if arg_slice is non-empty), DevicePolicy=strict, a
 * DeviceAllow list for API device nodes, one DeviceAllow entry for each
 * custom bind mount whose source is a device node, KillSignal and
 * KillMode=mixed (if arg_kill_signal != 0), and every assignment in
 * arg_property. */
2310 static int register_machine(pid_t pid
, int local_ifindex
) {
2311 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2312 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2318 r
= sd_bus_default_system(&bus
);
2320 return log_error_errno(r
, "Failed to open system bus: %m");
2322 if (arg_keep_unit
) {
2323 r
= sd_bus_call_method(
2325 "org.freedesktop.machine1",
2326 "/org/freedesktop/machine1",
2327 "org.freedesktop.machine1.Manager",
2328 "RegisterMachineWithNetwork",
2333 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2337 strempty(arg_directory
),
2338 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2340 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
2344 r
= sd_bus_message_new_method_call(
2347 "org.freedesktop.machine1",
2348 "/org/freedesktop/machine1",
2349 "org.freedesktop.machine1.Manager",
2350 "CreateMachineWithNetwork");
2352 return bus_log_create_error(r
);
2354 r
= sd_bus_message_append(
2358 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2362 strempty(arg_directory
),
2363 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2365 return bus_log_create_error(r
);
/* Open the array of (name, variant) unit properties */
2367 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
2369 return bus_log_create_error(r
);
2371 if (!isempty(arg_slice
)) {
2372 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
2374 return bus_log_create_error(r
);
2377 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
2379 return bus_log_create_error(r
);
2381 /* If you make changes here, also make sure to update
2382 * systemd-nspawn@.service, to keep the device
2383 * policies in sync regardless if we are run with or
2384 * without the --keep-unit switch. */
2385 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
2386 /* Allow the container to
2387 * access and create the API
2388 * device nodes, so that
2389 * PrivateDevices= in the
2390 * container can work
2395 "/dev/random", "rwm",
2396 "/dev/urandom", "rwm",
2398 "/dev/net/tun", "rwm",
2399 /* Allow the container
2400 * access to ptys. However,
2402 * container to ever create
2403 * these device nodes. */
2404 "/dev/pts/ptmx", "rw",
2407 return bus_log_create_error(r
);
/* Additionally allow device nodes that were passed in as bind mounts */
2409 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
2410 CustomMount
*cm
= &arg_custom_mounts
[j
];
2412 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
2415 r
= is_device_node(cm
->source
);
2417 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
2420 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
2421 cm
->source
, cm
->read_only
? "r" : "rw");
2423 return log_error_errno(r
, "Failed to append message arguments: %m");
2427 if (arg_kill_signal
!= 0) {
2428 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
2430 return bus_log_create_error(r
);
2432 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
2434 return bus_log_create_error(r
);
/* User-supplied property assignments, one "sv" struct each */
2437 STRV_FOREACH(i
, arg_property
) {
2438 r
= sd_bus_message_open_container(m
, 'r', "sv");
2440 return bus_log_create_error(r
);
2442 r
= bus_append_unit_property_assignment(m
, *i
);
2446 r
= sd_bus_message_close_container(m
);
2448 return bus_log_create_error(r
);
2451 r
= sd_bus_message_close_container(m
);
2453 return bus_log_create_error(r
);
2455 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
2459 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to the
 * container.
 *
 * pid: PID used to look up the machine; the lookup call's method name and
 *      arguments are not visible in this listing.
 *
 * Two bus calls are made: one on the Manager interface whose reply yields
 * the machine's object path, and one on the Machine interface of that
 * object. Failures of either call are logged at debug level only. */
2466 static int terminate_machine(pid_t pid
) {
2467 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2468 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
2469 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2476 /* If we are reusing the unit, then just exit, systemd will do
2477 * the right thing when we exit. */
2481 r
= sd_bus_default_system(&bus
);
2483 return log_error_errno(r
, "Failed to open system bus: %m");
2485 r
= sd_bus_call_method(
2487 "org.freedesktop.machine1",
2488 "/org/freedesktop/machine1",
2489 "org.freedesktop.machine1.Manager",
2496 /* Note that the machine might already have been
2497 * cleaned up automatically, hence don't consider it a
2498 * failure if we cannot get the machine object. */
2499 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
/* The reply carries the machine's object path ("o") */
2503 r
= sd_bus_message_read(reply
, "o", &path
);
2505 return bus_log_parse_error(r
);
2507 r
= sd_bus_call_method(
2509 "org.freedesktop.machine1",
2511 "org.freedesktop.machine1.Machine",
2517 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
/* Resets the audit login UID of this process by writing "4294967295" to
 * /proc/self/loginuid, unless the file already holds that value.
 *
 * arg_share_system is tested first; the branch body is not visible here.
 * When the write fails, a long explanatory message is logged; its text
 * mentions sleeping for 5s, but the sleep call itself is not visible in
 * this listing. */
2524 static int reset_audit_loginuid(void) {
2525 _cleanup_free_
char *p
= NULL
;
2528 if (arg_share_system
)
2531 r
= read_one_line_file("/proc/self/loginuid", &p
);
2535 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2537 /* Already reset? */
2538 if (streq(p
, "4294967295"))
2541 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
/* NOTE(review): the message below reads "or to off auditing"; "or to turn
 * off auditing" is presumably meant. */
2544 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2545 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2546 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2547 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2548 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Fixed 128-bit keys passed as hash_key to generate_mac(): one each for
 * the host side of the veth pair, the container side, and MACVLAN
 * interfaces. */
2556 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2557 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2558 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a MAC address from the host's machine ID, the container name
 * (arg_machine) and an index, by hashing them with siphash24.
 *
 * mac:      output; receives the first ETH_ALEN bytes of the hash, with
 *           the multicast bit cleared and the locally-administered bit
 *           set.
 * hash_key: siphash24 key (see the *_HASH_KEY macros).
 * idx:      extra value copied into the hashed buffer after the name;
 *           whether that copy is conditional is not visible in this
 *           listing.
 *
 * The hashed buffer v starts with the machine ID (sizeof(sd_id128_t)
 * bytes) followed by the strlen(arg_machine) bytes of the name. Its
 * allocation is not visible here. */
2560 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
, uint64_t idx
) {
2566 l
= strlen(arg_machine
);
2567 sz
= sizeof(sd_id128_t
) + l
;
2573 /* fetch some persistent data unique to the host */
2574 r
= sd_id128_get_machine((sd_id128_t
*) v
);
2578 /* combine with some data unique (on this host) to this
2579 * container instance */
2580 i
= mempcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
2583 memcpy(i
, &idx
, sizeof(idx
));
2586 /* Let's hash the host machine ID plus the container name. We
2587 * use a fixed, but originally randomly created hash key here. */
2588 siphash24(result
, v
, sz
, hash_key
.bytes
);
2590 assert_cc(ETH_ALEN
<= sizeof(result
));
2591 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
2593 /* see eth_random_addr in the kernel */
2594 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
2595 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates a veth pair over rtnetlink: the host end is named
 * "ve-<machine>" (or "vb-<machine>" when arg_network_bridge is set) and
 * the peer end "host0", which is assigned to the network namespace of
 * pid.
 *
 * pid:        process whose network namespace receives the peer.
 * iface_name: output buffer of IFNAMSIZ bytes for the host-side name.
 * ifi:        output for the host-side interface index; the assignment is
 *             not visible in this listing.
 *
 * Both ends get MAC addresses from generate_mac(). The early-exit
 * branches for !arg_private_network and !arg_network_veth are not visible
 * here. */
2600 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
2601 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2602 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2603 struct ether_addr mac_host
, mac_container
;
2606 if (!arg_private_network
)
2609 if (!arg_network_veth
)
2612 /* Use two different interface name prefixes depending whether
2613 * we are in bridge mode or not. */
/* NOTE(review): the size passed is IFNAMSIZ - 1, so the name is cut to at
 * most IFNAMSIZ - 2 characters plus NUL; the snprintf() result is not
 * checked in the lines visible here, so long machine names are truncated
 * silently -- confirm that is intended. */
2614 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
2615 arg_network_bridge
? "vb" : "ve", arg_machine
);
2617 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
, 0);
2619 return log_error_errno(r
, "Failed to generate predictable MAC address for container side: %m");
2621 r
= generate_mac(&mac_host
, HOST_HASH_KEY
, 0);
2623 return log_error_errno(r
, "Failed to generate predictable MAC address for host side: %m");
2625 r
= sd_netlink_open(&rtnl
);
2627 return log_error_errno(r
, "Failed to connect to netlink: %m");
2629 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2631 return log_error_errno(r
, "Failed to allocate netlink message: %m");
/* Host side: name and MAC */
2633 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, iface_name
);
2635 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2637 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
2639 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
/* Nested containers: LINKINFO > INFO_DATA("veth") > VETH_INFO_PEER */
2641 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2643 return log_error_errno(r
, "Failed to open netlink container: %m");
2645 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
2647 return log_error_errno(r
, "Failed to open netlink container: %m");
2649 r
= sd_netlink_message_open_container(m
, VETH_INFO_PEER
);
2651 return log_error_errno(r
, "Failed to open netlink container: %m");
/* Peer side: name, MAC and target namespace */
2653 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, "host0");
2655 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2657 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
2659 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2661 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2663 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2665 r
= sd_netlink_message_close_container(m
);
2667 return log_error_errno(r
, "Failed to close netlink container: %m");
2669 r
= sd_netlink_message_close_container(m
);
2671 return log_error_errno(r
, "Failed to close netlink container: %m");
2673 r
= sd_netlink_message_close_container(m
);
2675 return log_error_errno(r
, "Failed to close netlink container: %m");
2677 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2679 return log_error_errno(r
, "Failed to add new veth interfaces (host0, %s): %m", iface_name
);
2681 i
= (int) if_nametoindex(iface_name
);
2683 return log_error_errno(errno
, "Failed to resolve interface %s: %m", iface_name
);
/* Brings the host-side veth interface up and enslaves it to the bridge
 * named by arg_network_bridge, using one RTM_SETLINK request.
 *
 * veth_name: name of the host-side veth interface.
 * ifi:       output for an interface index; the assignment is not visible
 *            in this listing.
 *
 * The early-exit branches for !arg_private_network, !arg_network_veth and
 * !arg_network_bridge are not visible here. */
2690 static int setup_bridge(const char veth_name
[], int *ifi
) {
2691 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2692 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2695 if (!arg_private_network
)
2698 if (!arg_network_veth
)
2701 if (!arg_network_bridge
)
2704 bridge
= (int) if_nametoindex(arg_network_bridge
);
2706 return log_error_errno(errno
, "Failed to resolve interface %s: %m", arg_network_bridge
);
2710 r
= sd_netlink_open(&rtnl
);
2712 return log_error_errno(r
, "Failed to connect to netlink: %m");
2714 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
2716 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2718 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
2720 return log_error_errno(r
, "Failed to set IFF_UP flag: %m");
/* The link is addressed by name (the message was created with index 0) */
2722 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, veth_name
);
2724 return log_error_errno(r
, "Failed to add netlink interface name field: %m");
2726 r
= sd_netlink_message_append_u32(m
, IFLA_MASTER
, bridge
);
2728 return log_error_errno(r
, "Failed to add netlink master field: %m");
2730 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2732 return log_error_errno(r
, "Failed to add veth interface to bridge: %m");
/* Resolves a network interface name to its index and checks through udev
 * that the interface has been initialized.
 *
 * udev: udev context used for the device lookup.
 * name: interface name.
 *
 * The udev device is looked up by the device ID "n<ifindex>". The return
 * statements are not visible in this listing; callers use the result as
 * the interface index. */
2737 static int parse_interface(struct udev
*udev
, const char *name
) {
2738 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
/* Room for 'n', the decimal index, and (presumably) the NUL */
2739 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
2742 ifi
= (int) if_nametoindex(name
);
2744 return log_error_errno(errno
, "Failed to resolve interface %s: %m", name
);
2746 sprintf(ifi_str
, "n%i", ifi
);
2747 d
= udev_device_new_from_device_id(udev
, ifi_str
);
2749 return log_error_errno(errno
, "Failed to get udev device for interface %s: %m", name
);
2751 if (udev_device_get_is_initialized(d
) <= 0) {
2752 log_error("Network interface %s is not initialized yet.", name
);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of pid, one RTM_SETLINK request per interface.
 *
 * pid: process whose network namespace receives the interfaces.
 *
 * Each name is resolved with parse_interface(). The early-exit branches
 * for !arg_private_network and an empty list, and the creation of the
 * udev context, are not visible in this listing. */
2759 static int move_network_interfaces(pid_t pid
) {
2760 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2761 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2765 if (!arg_private_network
)
2768 if (strv_isempty(arg_network_interfaces
))
2771 r
= sd_netlink_open(&rtnl
);
2773 return log_error_errno(r
, "Failed to connect to netlink: %m");
2777 log_error("Failed to connect to udev.");
2781 STRV_FOREACH(i
, arg_network_interfaces
) {
2782 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2785 ifi
= parse_interface(udev
, *i
);
2789 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
2791 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2793 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2795 return log_error_errno(r
, "Failed to append namespace PID to netlink message: %m");
2797 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2799 return log_error_errno(r
, "Failed to move interface %s to namespace: %m", *i
);
/* For every host interface listed in arg_network_macvlan, creates a
 * MACVLAN interface in bridge mode on top of it, named "mv-<interface>"
 * (shortened to IFNAMSIZ-1 characters), and assigns it to the network
 * namespace of pid.
 *
 * pid: process whose network namespace receives the new interfaces.
 *
 * Each interface gets a MAC address from generate_mac() with
 * MACVLAN_HASH_KEY and an index that is incremented for each entry. The
 * early-exit branches for !arg_private_network and an empty list, and the
 * creation of the udev context, are not visible in this listing. */
2805 static int setup_macvlan(pid_t pid
) {
2806 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2807 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2812 if (!arg_private_network
)
2815 if (strv_isempty(arg_network_macvlan
))
2818 r
= sd_netlink_open(&rtnl
);
2820 return log_error_errno(r
, "Failed to connect to netlink: %m");
2824 log_error("Failed to connect to udev.");
2828 STRV_FOREACH(i
, arg_network_macvlan
) {
2829 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2830 _cleanup_free_
char *n
= NULL
;
2831 struct ether_addr mac
;
2834 ifi
= parse_interface(udev
, *i
);
2838 r
= generate_mac(&mac
, MACVLAN_HASH_KEY
, idx
++);
2840 return log_error_errno(r
, "Failed to create MACVLAN MAC address: %m");
2842 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2844 return log_error_errno(r
, "Failed to allocate netlink message: %m");
/* IFLA_LINK names the underlying host interface */
2846 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2848 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2850 n
= strappend("mv-", *i
);
2854 strshorten(n
, IFNAMSIZ
-1);
2856 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2858 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2860 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac
);
2862 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2864 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2866 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
/* Nested containers: LINKINFO > INFO_DATA("macvlan") */
2868 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2870 return log_error_errno(r
, "Failed to open netlink container: %m");
2872 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
2874 return log_error_errno(r
, "Failed to open netlink container: %m");
2876 r
= sd_netlink_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
2878 return log_error_errno(r
, "Failed to append macvlan mode: %m");
2880 r
= sd_netlink_message_close_container(m
);
2882 return log_error_errno(r
, "Failed to close netlink container: %m");
2884 r
= sd_netlink_message_close_container(m
);
2886 return log_error_errno(r
, "Failed to close netlink container: %m");
2888 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2890 return log_error_errno(r
, "Failed to add new macvlan interfaces: %m");
/* Creates one IPVLAN link per entry of arg_network_ipvlan, stacked on the
 * named host interface (IFLA_LINK) and placed in the network namespace of
 * `pid` (IFLA_NET_NS_PID). The new link is named "iv-" + host interface name,
 * shortened to IFNAMSIZ-1 characters, and is configured in IPVLAN_MODE_L2.
 * Failures are logged and returned. */
2896 static int setup_ipvlan(pid_t pid
) {
2897 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2898 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2902 if (!arg_private_network
)
2905 if (strv_isempty(arg_network_ipvlan
))
2908 r
= sd_netlink_open(&rtnl
);
2910 return log_error_errno(r
, "Failed to connect to netlink: %m");
2914 log_error("Failed to connect to udev.");
2918 STRV_FOREACH(i
, arg_network_ipvlan
) {
2919 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2920 _cleanup_free_
char *n
= NULL
;
2923 ifi
= parse_interface(udev
, *i
);
2927 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2929 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2931 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2933 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2935 n
= strappend("iv-", *i
);
2939 strshorten(n
, IFNAMSIZ
-1);
2941 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2943 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2945 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2947 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2949 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2951 return log_error_errno(r
, "Failed to open netlink container: %m");
2953 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "ipvlan");
2955 return log_error_errno(r
, "Failed to open netlink container: %m");
2957 r
= sd_netlink_message_append_u16(m
, IFLA_IPVLAN_MODE
, IPVLAN_MODE_L2
);
2959 return log_error_errno(r
, "Failed to add ipvlan mode: %m");
2961 r
= sd_netlink_message_close_container(m
);
2963 return log_error_errno(r
, "Failed to close netlink container: %m");
2965 r
= sd_netlink_message_close_container(m
);
2967 return log_error_errno(r
, "Failed to close netlink container: %m");
2969 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2971 return log_error_errno(r
, "Failed to add new ipvlan interfaces: %m");
/* Builds and loads the seccomp filter for the container. The filter is
 * created with a default action of SCMP_ACT_ALLOW. Each (capability, syscall)
 * pair in the blacklist table gets an EPERM rule; the table entry is first
 * tested against the arg_retain capability mask (presumably to skip syscalls
 * whose capability is retained - the branch body is not visible here).
 * A further rule answers audit netlink socket creation with EAFNOSUPPORT,
 * and SCMP_FLTATR_CTL_NNP is set to 0 before seccomp_load(). The context is
 * released with seccomp_release(). */
2977 static int setup_seccomp(void) {
2980 static const struct {
2981 uint64_t capability
;
2984 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
2985 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
2986 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
2987 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
2988 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
2989 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
2990 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
2991 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
2992 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
2993 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
2996 scmp_filter_ctx seccomp
;
3000 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
3004 r
= seccomp_add_secondary_archs(seccomp
);
3006 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
3010 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
3011 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
3014 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
3016 continue; /* unknown syscall */
3018 log_error_errno(r
, "Failed to block syscall: %m");
3025 Audit is broken in containers, much of the userspace audit
3026 hookup will fail if running inside a container. We don't
3027 care and just turn off creation of audit sockets.
3029 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
3030 with EAFNOSUPPORT which audit userspace uses as indication
3031 that audit is disabled in the kernel.
3034 r
= seccomp_rule_add(
3036 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
3039 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
3040 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
3042 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
3046 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
3048 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
3052 r
= seccomp_load(seccomp
);
3054 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
3059 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
3064 seccomp_release(seccomp
);
/* Prepares the mount propagation directory for the container rooted at `root`.
 * On the host, /run/systemd/nspawn/propagate/<arg_machine> is created
 * (best-effort, results ignored). Inside the container tree,
 * /run/systemd/nspawn/incoming is created via userns_mkdir(). The host
 * directory is then bind-mounted onto the container directory and the bind
 * mount is remounted read-only. Failures of userns_mkdir() and mount() are
 * logged together with the errno text and returned. */
3072 static int setup_propagate(const char *root
) {
3075 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3076 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3077 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3078 (void) mkdir_p(p
, 0600);
3080 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
3081 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
3083 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3084 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
3086 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3087 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
3089 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
3090 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
/* %m added: errno is passed in, so the message should actually show it. */
3091 return log_error_errno(errno
, "Failed to install propagation bind mount: %m");
3093 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
3094 return log_error_errno(errno
, "Failed to make propagation mount read-only: %m");
/* Opens arg_image and makes it available as a block device. The image is
 * opened read-only or read-write depending on arg_read_only and then
 * fstat()ed. A block device takes the S_ISBLK() branch, which duplicates
 * the arg_image path. Anything that is not a regular file is rejected.
 * For a regular file a free loop device is obtained from /dev/loop-control
 * (LOOP_CTL_GET_FREE), opened, attached to the image fd (LOOP_SET_FD) and
 * configured with LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus
 * LO_FLAGS_READ_ONLY on the visible conditional path). The loop device path
 * is stored in *device_path. */
3099 static int setup_image(char **device_path
, int *loop_nr
) {
3100 struct loop_info64 info
= {
3101 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
3103 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
3104 _cleanup_free_
char* loopdev
= NULL
;
3108 assert(device_path
);
3112 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3114 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
3116 if (fstat(fd
, &st
) < 0)
3117 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
3119 if (S_ISBLK(st
.st_mode
)) {
3122 p
= strdup(arg_image
);
3136 if (!S_ISREG(st
.st_mode
)) {
/* No call failed here, so errno is stale; log without errno and %m. */
3137 log_error("%s is not a regular file or block device.", arg_image
);
3141 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3143 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
3145 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
3147 return log_error_errno(errno
, "Failed to allocate loop device: %m");
3149 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
3152 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3154 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
3156 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
3157 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
3160 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
3162 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
3163 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
3165 *device_path
= loopdev
;
/* Explanatory text appended to the partition-table error messages emitted by
 * dissect_image(); it describes which image layouts are accepted. */
3176 #define PARTITION_TABLE_BLURB \
3177 "Note that the disk image needs to either contain only a single MBR partition of\n" \
3178 "type 0x83 that is marked bootable, or a single GPT partition of type " \
3179 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
3180 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
3181 "to be bootable with systemd-nspawn."
/* Finds the root, /home and /srv partitions of the image open on `fd`.
 *
 * The device is probed with libblkid for a partition table; only "gpt" and
 * "dos" (MBR) tables are accepted. The partitions reported by blkid are then
 * matched against the partition devices udev enumerates below the block
 * device. If the counts disagree, the partition table is re-read with
 * BLKRRPART, retrying up to 20 times with a 50ms sleep while the ioctl
 * reports EBUSY.
 *
 * On GPT, partitions flagged GPT_FLAG_NO_AUTO are tested first and the rest
 * are classified by type UUID (GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE,
 * GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); GPT_FLAG_READ_ONLY clears the
 * matching *_rw flag. On MBR, the bootable flag (0x80) and type 0x83 are
 * checked.
 *
 * The root device is chosen in this order: native root, secondary root, then
 * a generic Linux partition (an error is logged when several generic
 * candidates were seen). Results are returned through the *_device and
 * *_device_rw output parameters. */
3183 static int dissect_image(
3185 char **root_device
, bool *root_device_rw
,
3186 char **home_device
, bool *home_device_rw
,
3187 char **srv_device
, bool *srv_device_rw
,
3191 int home_nr
= -1, srv_nr
= -1;
3192 #ifdef GPT_ROOT_NATIVE
3195 #ifdef GPT_ROOT_SECONDARY
3196 int secondary_root_nr
= -1;
3198 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
3199 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
3200 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
3201 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
3202 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
3203 struct udev_list_entry
*first
, *item
;
3204 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
3205 bool is_gpt
, is_mbr
, multiple_generic
= false;
3206 const char *pttype
= NULL
;
3213 assert(root_device
);
3214 assert(home_device
);
3219 b
= blkid_new_probe();
3224 r
= blkid_probe_set_device(b
, fd
, 0, 0);
3229 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
3233 blkid_probe_enable_partitions(b
, 1);
3234 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
3237 r
= blkid_do_safeprobe(b
);
3238 if (r
== -2 || r
== 1) {
3239 log_error("Failed to identify any partition table on\n"
3241 PARTITION_TABLE_BLURB
, arg_image
);
3243 } else if (r
!= 0) {
3246 log_error_errno(errno
, "Failed to probe: %m");
3250 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
3252 is_gpt
= streq_ptr(pttype
, "gpt");
3253 is_mbr
= streq_ptr(pttype
, "dos");
3255 if (!is_gpt
&& !is_mbr
) {
3256 log_error("No GPT or MBR partition table discovered on\n"
3258 PARTITION_TABLE_BLURB
, arg_image
);
3263 pl
= blkid_probe_get_partitions(b
);
3268 log_error("Failed to list partitions of %s", arg_image
);
3276 if (fstat(fd
, &st
) < 0)
3277 return log_error_errno(errno
, "Failed to stat block device: %m");
3279 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
3287 log_error("Kernel partitions never appeared.");
3291 e
= udev_enumerate_new(udev
);
3295 r
= udev_enumerate_add_match_parent(e
, d
);
3299 r
= udev_enumerate_scan_devices(e
);
3301 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
3303 /* Count the partitions enumerated by the kernel */
3305 first
= udev_enumerate_get_list_entry(e
);
3306 udev_list_entry_foreach(item
, first
)
3309 /* Count the partitions enumerated by blkid */
3310 m
= blkid_partlist_numof_partitions(pl
);
3314 log_error("blkid and kernel partition list do not match.");
3320 /* The kernel has probed fewer partitions than
3321 * blkid? Maybe the kernel prober is still
3322 * running or it got EBUSY because udev
3323 * already opened the device. Let's reprobe
3324 * the device, which is a synchronous call
3325 * that waits until probing is complete. */
3327 for (j
= 0; j
< 20; j
++) {
3329 r
= ioctl(fd
, BLKRRPART
, 0);
3332 if (r
>= 0 || r
!= -EBUSY
)
3335 /* If something else has the device
3336 * open, such as an udev rule, the
3337 * ioctl will return EBUSY. Since
3338 * there's no way to wait until it
3339 * isn't busy anymore, let's just wait
3340 * a bit, and try again.
3342 * This is really something they
3343 * should fix in the kernel! */
3345 usleep(50 * USEC_PER_MSEC
);
3349 return log_error_errno(r
, "Failed to reread partition table: %m");
3352 e
= udev_enumerate_unref(e
);
3355 first
= udev_enumerate_get_list_entry(e
);
3356 udev_list_entry_foreach(item
, first
) {
3357 _cleanup_udev_device_unref_
struct udev_device
*q
;
3359 unsigned long long flags
;
3365 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
3370 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
3374 qn
= udev_device_get_devnum(q
);
3378 if (st
.st_rdev
== qn
)
3381 node
= udev_device_get_devnode(q
);
3385 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
3389 flags
= blkid_partition_get_flags(pp
);
3391 nr
= blkid_partition_get_partno(pp
);
3399 if (flags
& GPT_FLAG_NO_AUTO
)
3402 stype
= blkid_partition_get_type_string(pp
);
3406 if (sd_id128_from_string(stype
, &type_id
) < 0)
3409 if (sd_id128_equal(type_id
, GPT_HOME
)) {
3411 if (home
&& nr
>= home_nr
)
3415 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3417 r
= free_and_strdup(&home
, node
);
3421 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
3423 if (srv
&& nr
>= srv_nr
)
3427 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3429 r
= free_and_strdup(&srv
, node
);
3433 #ifdef GPT_ROOT_NATIVE
3434 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
3436 if (root
&& nr
>= root_nr
)
3440 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3442 r
= free_and_strdup(&root
, node
);
3447 #ifdef GPT_ROOT_SECONDARY
3448 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
3450 if (secondary_root
&& nr
>= secondary_root_nr
)
3453 secondary_root_nr
= nr
;
3454 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3456 r
= free_and_strdup(&secondary_root
, node
);
3461 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
3464 multiple_generic
= true;
3466 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3468 r
= free_and_strdup(&generic
, node
);
3474 } else if (is_mbr
) {
3477 if (flags
!= 0x80) /* Bootable flag */
3480 type
= blkid_partition_get_type(pp
);
3481 if (type
!= 0x83) /* Linux partition */
3485 multiple_generic
= true;
3489 r
= free_and_strdup(&root
, node
);
3497 *root_device
= root
;
3500 *root_device_rw
= root_rw
;
3502 } else if (secondary_root
) {
3503 *root_device
= secondary_root
;
3504 secondary_root
= NULL
;
3506 *root_device_rw
= secondary_root_rw
;
3508 } else if (generic
) {
3510 /* There were no partitions with precise meanings
3511 * around, but we found generic partitions. In this
3512 * case, if there's only one, we can go ahead and boot
3513 * it, otherwise we bail out, because we really cannot
3514 * make any sense of it. */
3516 if (multiple_generic
) {
3517 log_error("Identified multiple bootable Linux partitions on\n"
3519 PARTITION_TABLE_BLURB
, arg_image
);
3523 *root_device
= generic
;
3526 *root_device_rw
= generic_rw
;
3529 log_error("Failed to identify root partition in disk image\n"
3531 PARTITION_TABLE_BLURB
, arg_image
);
3536 *home_device
= home
;
3539 *home_device_rw
= home_rw
;
3546 *srv_device_rw
= srv_rw
;
3551 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts block device `what` at `where` + `directory`. The file system type
 * is detected with a libblkid superblock probe (BLKID_SUBLKS_TYPE) and read
 * from the "TYPE" value; "crypto_LUKS" is rejected as unsupported. The mount
 * uses MS_NODEV and adds MS_RDONLY when `rw` is false. Failures are logged.
 *
 * blkid_do_safeprobe() returns 0 on success, 1 when nothing was detected,
 * -2 on an ambivalent result and -1 on error. "Nothing detected" and
 * "ambivalent" both mean the type cannot be determined; any other non-zero
 * value is a real probe failure reported with errno. This matches the check
 * done in dissect_image(). */
3556 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
3558 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
3559 const char *fstype
, *p
;
3569 p
= strjoina(where
, directory
);
3574 b
= blkid_new_probe_from_filename(what
);
3578 log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
3582 blkid_probe_enable_superblocks(b
, 1);
3583 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
3586 r
= blkid_do_safeprobe(b
);
3587 if (r
== -2 || r
== 1) {
3588 log_error("Cannot determine file system type of %s", what
);
3590 } else if (r
!= 0) {
3593 log_error_errno(errno
, "Failed to probe %s: %m", what
);
3598 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
3601 log_error("Failed to determine file system type of %s", what
);
3605 if (streq(fstype
, "crypto_LUKS")) {
3606 log_error("nspawn currently does not support LUKS disk images.");
3610 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
3611 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
3615 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the dissected partitions below arg_directory via mount_device():
 * the root device at the directory itself, the home device at "/home" and
 * the srv device at "/srv", each with its own read-write flag. Failures are
 * logged and returned. */
3620 static int mount_devices(
3622 const char *root_device
, bool root_device_rw
,
3623 const char *home_device
, bool home_device_rw
,
3624 const char *srv_device
, bool srv_device_rw
) {
3630 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
3632 return log_error_errno(r
, "Failed to mount root directory: %m");
3636 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
3638 return log_error_errno(r
, "Failed to mount home directory: %m");
3642 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
3644 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Tears down loop device number `nr`. If *image_fd refers to an open fd, the
 * backing file is detached with LOOP_CLR_FD and the fd is closed (the slot is
 * overwritten with safe_close()'s result). Afterwards /dev/loop-control is
 * opened and LOOP_CTL_REMOVE is issued for `nr`. All failures are only logged
 * (debug or warning level); nothing is returned to the caller. */
3650 static void loop_remove(int nr
, int *image_fd
) {
3651 _cleanup_close_
int control
= -1;
3657 if (image_fd
&& *image_fd
>= 0) {
3658 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
3660 log_debug_errno(errno
, "Failed to close loop image: %m");
3661 *image_fd
= safe_close(*image_fd
);
3664 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3666 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
3670 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
3672 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
/* Forks a child that runs "getent <database> <key>" with an empty
 * environment and its stdout connected to a pipe. In the child, stdin and
 * stderr are redirected to /dev/null, signal handlers and the signal mask are
 * reset, all other fds are closed, and /usr/bin/getent is tried before
 * /bin/getent. The parent closes the write end of the pipe.
 * NOTE(review): the `rpid` parameter and the return value are presumably the
 * child PID and the pipe's read end - the assignments are not visible here. */
3675 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
3683 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0)
3684 return log_error_errno(errno
, "Failed to allocate pipe: %m");
3688 return log_error_errno(errno
, "Failed to fork getent child: %m");
3689 else if (pid
== 0) {
3691 char *empty_env
= NULL
;
3693 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
3694 _exit(EXIT_FAILURE
);
3696 if (pipe_fds
[0] > 2)
3697 safe_close(pipe_fds
[0]);
3698 if (pipe_fds
[1] > 2)
3699 safe_close(pipe_fds
[1]);
3701 nullfd
= open("/dev/null", O_RDWR
);
3703 _exit(EXIT_FAILURE
);
3705 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
3706 _exit(EXIT_FAILURE
);
3708 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
3709 _exit(EXIT_FAILURE
);
3714 (void) reset_all_signal_handlers();
3715 (void) reset_signal_mask();
3716 close_all_fds(NULL
, 0);
3718 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3719 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3720 _exit(EXIT_FAILURE
);
3723 pipe_fds
[1] = safe_close(pipe_fds
[1]);
/* Switches the process to the user named by arg_user.
 *
 * When arg_user is unset, "root" or "0", credentials are reset via
 * reset_uid_gid(). Otherwise "getent passwd <user>" is spawned and its output
 * line is split on ':' into the UID, GID and home directory fields, and
 * "getent initgroups <user>" supplies the supplementary group list. The home
 * directory and its parents are created, the standard fds are chown()ed to
 * the user (best-effort), and setgroups(), setresgid() and setresuid() are
 * applied in that order. Failures are logged.
 * NOTE(review): `_home` presumably receives the home directory path - the
 * assignment is not visible here. */
3730 static int change_uid_gid(char **_home
) {
3731 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
3732 const char *word
, *state
;
3733 _cleanup_free_ uid_t
*uids
= NULL
;
3734 _cleanup_free_
char *home
= NULL
;
3735 _cleanup_fclose_
FILE *f
= NULL
;
3736 _cleanup_close_
int fd
= -1;
3737 unsigned n_uids
= 0;
3746 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
3747 /* Reset everything fully to 0, just in case */
3749 r
= reset_uid_gid();
3751 return log_error_errno(r
, "Failed to become root: %m");
3757 /* First, get user credentials */
3758 fd
= spawn_getent("passwd", arg_user
, &pid
);
3762 f
= fdopen(fd
, "r");
3767 if (!fgets(line
, sizeof(line
), f
)) {
3770 log_error("Failed to resolve user %s.", arg_user
);
3774 log_error_errno(errno
, "Failed to read from getent: %m");
3780 wait_for_terminate_and_warn("getent passwd", pid
, true);
3782 x
= strchr(line
, ':');
3784 log_error("/etc/passwd entry has invalid user field.");
3788 u
= strchr(x
+1, ':');
3790 log_error("/etc/passwd entry has invalid password field.");
3797 log_error("/etc/passwd entry has invalid UID field.");
3805 log_error("/etc/passwd entry has invalid GID field.");
3810 h
= strchr(x
+1, ':');
3812 log_error("/etc/passwd entry has invalid GECOS field.");
3819 log_error("/etc/passwd entry has invalid home directory field.");
3825 r
= parse_uid(u
, &uid
);
3827 log_error("Failed to parse UID of user.");
3831 r
= parse_gid(g
, &gid
);
3833 log_error("Failed to parse GID of user.");
3841 /* Second, get group memberships */
3842 fd
= spawn_getent("initgroups", arg_user
, &pid
);
3847 f
= fdopen(fd
, "r");
3852 if (!fgets(line
, sizeof(line
), f
)) {
3854 log_error("Failed to resolve user %s.", arg_user
);
3858 log_error_errno(errno
, "Failed to read from getent: %m");
3864 wait_for_terminate_and_warn("getent initgroups", pid
, true);
3866 /* Skip over the username and subsequent separator whitespace */
3868 x
+= strcspn(x
, WHITESPACE
);
3869 x
+= strspn(x
, WHITESPACE
);
3871 FOREACH_WORD(word
, l
, x
, state
) {
3877 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
3880 r
= parse_uid(c
, &uids
[n_uids
++]);
3882 log_error("Failed to parse group data from getent.");
3887 r
= mkdir_parents(home
, 0775);
3889 return log_error_errno(r
, "Failed to make home root directory: %m");
3891 r
= mkdir_safe(home
, 0755, uid
, gid
);
3892 if (r
< 0 && r
!= -EEXIST
)
3893 return log_error_errno(r
, "Failed to make home directory: %m");
3895 (void) fchown(STDIN_FILENO
, uid
, gid
);
3896 (void) fchown(STDOUT_FILENO
, uid
, gid
);
3897 (void) fchown(STDERR_FILENO
, uid
, gid
);
3899 if (setgroups(n_uids
, uids
) < 0)
3900 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
/* The messages below name the calls that are actually made. */
3902 if (setresgid(gid
, gid
, gid
) < 0)
3903 return log_error_errno(errno
, "setresgid() failed: %m");
3905 if (setresuid(uid
, uid
, uid
) < 0)
3906 return log_error_errno(errno
, "setresuid() failed: %m");
3918 * < 0 : wait_for_terminate() failed to get the state of the
3919 * container, the container was terminated by a signal, or
3920 * failed for an unknown reason. No change is made to the
3921 * container argument.
3922 * > 0 : The program executed in the container terminated with an
3923 * error. The exit code of the program executed in the
3924 * container is returned. The container argument has been set
3925 * to CONTAINER_TERMINATED.
3926 * 0 : The container is being rebooted, has been shut down or exited
3927 * successfully. The container argument has been set to either
3928 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3930 * That is, success is indicated by a return value of zero, and an
3931 * error is indicated by a non-zero value.
/* Waits for the container process `pid` with wait_for_terminate() and
 * classifies its siginfo. In the exit branch the exit status is logged and
 * returned, and *container is set to CONTAINER_TERMINATED. In the signal
 * branch SIGINT is reported as a shutdown (CONTAINER_TERMINATED) and SIGHUP
 * as a reboot (CONTAINER_REBOOTED); other signals and unknown si_code values
 * are logged as errors. */
3933 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
3937 r
= wait_for_terminate(pid
, &status
);
3939 return log_warning_errno(r
, "Failed to wait for container: %m");
3941 switch (status
.si_code
) {
3944 if (status
.si_status
== 0) {
3945 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
3948 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
3950 *container
= CONTAINER_TERMINATED
;
3951 return status
.si_status
;
3954 if (status
.si_status
== SIGINT
) {
3956 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
3957 *container
= CONTAINER_TERMINATED
;
3960 } else if (status
.si_status
== SIGHUP
) {
3962 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
3963 *container
= CONTAINER_REBOOTED
;
3967 /* CLD_KILLED fallthrough */
3970 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
3974 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler with an intentionally empty body. */
3981 static void nop_handler(int sig
) {}
/* sd-event signal callback. `userdata` carries the container PID (decoded
 * with PTR_TO_UINT32). The PID is sent arg_kill_signal; when kill() succeeds
 * a hint is logged and the source's userdata is cleared to NULL. The visible
 * sd_event_exit(..., 0) call ends the event loop (presumably on the paths
 * where no orderly halt was started - the branching is not visible here). */
3983 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
3986 pid
= PTR_TO_UINT32(userdata
);
3988 if (kill(pid
, arg_kill_signal
) >= 0) {
3989 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3990 sd_event_source_set_userdata(s
, NULL
);
3995 sd_event_exit(sd_event_source_get_event(s
), 0);
/* Fills in whichever of arg_image, arg_directory and arg_machine were not
 * given on the command line.
 *
 * With neither image nor directory set, the machine name is looked up with
 * image_find(); an IMAGE_RAW result sets arg_image, anything else sets
 * arg_directory, and the image's read_only flag is OR-ed into arg_read_only.
 * The current working directory is also used as a directory fallback.
 * A missing machine name is derived from the host name when the directory is
 * "/", otherwise from the basename of the image or directory; it is then
 * cleaned with hostname_cleanup() and validated. For ephemeral containers a
 * random 64-bit hex suffix is appended to the machine name. */
3999 static int determine_names(void) {
4002 if (!arg_image
&& !arg_directory
) {
4004 _cleanup_(image_unrefp
) Image
*i
= NULL
;
4006 r
= image_find(arg_machine
, &i
);
4008 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
/* No error code applies on this path, so the message carries no %m. */
4010 log_error("No image for machine '%s'.", arg_machine
);
4014 if (i
->type
== IMAGE_RAW
)
4015 r
= set_sanitized_path(&arg_image
, i
->path
);
4017 r
= set_sanitized_path(&arg_directory
, i
->path
);
4019 return log_error_errno(r
, "Invalid image directory: %m");
4022 arg_read_only
= arg_read_only
|| i
->read_only
;
4024 arg_directory
= get_current_dir_name();
4026 if (!arg_directory
&& !arg_machine
) {
4027 log_error("Failed to determine path, please use -D or -i.");
4033 if (arg_directory
&& path_equal(arg_directory
, "/"))
4034 arg_machine
= gethostname_malloc();
4036 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
4041 hostname_cleanup(arg_machine
);
4042 if (!machine_name_is_valid(arg_machine
)) {
4043 log_error("Failed to determine machine name automatically, please use -M.");
4047 if (arg_ephemeral
) {
4050 /* Add a random suffix when this is an
4051 * ephemeral machine, so that we can run many
4052 * instances at once without manually having
4053 * to specify -M each time. */
4055 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Determines the user namespace UID/GID base. When arg_uid_shift is still
 * UID_INVALID, `directory` is stat()ed and the base is taken from the upper
 * 16 bits of its owner UID; the owner GID must yield the same base. The range
 * is then set to 0x10000. The base is rejected when base + range would exceed
 * the uid_t value space. The chosen base and range are logged. */
4066 static int determine_uid_shift(const char *directory
) {
4074 if (arg_uid_shift
== UID_INVALID
) {
4077 r
= stat(directory
, &st
);
4079 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
4081 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
4083 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
4084 log_error("UID and GID base of %s don't match.", directory
);
4088 arg_uid_range
= UINT32_C(0x10000);
4091 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
4092 log_error("UID base too high for UID range.");
4096 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Body of the innermost child, i.e. the process that becomes the container's
 * payload.
 *
 * It synchronizes with the parent over `barrier` (UID map written, cgroup
 * set up), mounts the API file systems, resets UID/GID, and sets up the boot
 * ID and kmsg. It then drops capabilities, applies the requested
 * personality and SELinux exec context, and switches user via
 * change_uid_gid(). The environment is assembled from the fixed envp table
 * plus TERM, HOME, USER, LOGNAME, container_uuid and, when fds were passed,
 * LISTEN_FDS/LISTEN_PID, then merged with arg_setenv. After a final barrier
 * sync all fds outside `fds` are closed. The payload is then executed: an
 * init binary search (/usr/lib/systemd/systemd, /lib/systemd/systemd,
 * /sbin/init), the command given on the command line, or a login shell
 * (/bin/bash, then /bin/sh) started from the home directory. Reaching the
 * end means every exec attempt failed. */
4100 static int inner_child(
4102 const char *directory
,
4110 _cleanup_free_
char *home
= NULL
;
4112 const char *envp
[] = {
4113 "PATH=" DEFAULT_PATH_SPLIT_USR
,
4114 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4119 NULL
, /* container_uuid */
4120 NULL
, /* LISTEN_FDS */
4121 NULL
, /* LISTEN_PID */
4125 _cleanup_strv_free_
char **env_use
= NULL
;
4130 assert(kmsg_socket
>= 0);
4133 /* Tell the parent, that it now can write the UID map. */
4134 (void) barrier_place(barrier
); /* #1 */
4136 /* Wait until the parent wrote the UID map */
4137 if (!barrier_place_and_sync(barrier
)) { /* #2 */
4138 log_error("Parent died too early");
4143 r
= mount_all(NULL
, true);
4147 /* Wait until we are cgroup-ified, so that we
4148 * can mount the right cgroup path writable */
4149 if (!barrier_place_and_sync(barrier
)) { /* #3 */
4150 log_error("Parent died too early");
4154 r
= mount_systemd_cgroup_writable("");
4158 r
= reset_uid_gid();
4160 return log_error_errno(r
, "Couldn't become new root: %m");
4162 r
= setup_boot_id(NULL
);
4166 r
= setup_kmsg(NULL
, kmsg_socket
);
4169 kmsg_socket
= safe_close(kmsg_socket
);
4174 return log_error_errno(errno
, "setsid() failed: %m");
4176 if (arg_private_network
)
4179 r
= send_rtnl(rtnl_socket
);
4182 rtnl_socket
= safe_close(rtnl_socket
);
4184 if (drop_capabilities() < 0)
4185 return log_error_errno(errno
, "drop_capabilities() failed: %m");
4189 if (arg_personality
!= PERSONALITY_INVALID
) {
4190 if (personality(arg_personality
) < 0)
4191 return log_error_errno(errno
, "personality() failed: %m");
4192 } else if (secondary
) {
4193 if (personality(PER_LINUX32
) < 0)
4194 return log_error_errno(errno
, "personality() failed: %m");
4198 if (arg_selinux_context
)
4199 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
4200 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
4203 r
= change_uid_gid(&home
);
4207 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
4211 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
4212 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
4213 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
4216 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
4219 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
4223 if (fdset_size(fds
) > 0) {
4224 r
= fdset_cloexec(fds
, false);
/* %m added: r is passed in, so the message should actually show it. */
4226 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors: %m");
4228 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
4229 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
4233 env_use
= strv_env_merge(2, envp
, arg_setenv
);
4237 /* Let the parent know that we are ready and
4238 * wait until the parent is ready with the
4240 if (!barrier_place_and_sync(barrier
)) { /* #4 */
4241 log_error("Parent died too early");
4245 /* Now, explicitly close the log, so that we
4246 * then can close all remaining fds. Closing
4247 * the log explicitly first has the benefit
4248 * that the logging subsystem knows about it,
4249 * and is thus ready to be reopened should we
4250 * need it again. Note that the other fds
4251 * closed here are at least the locking and
4254 (void) fdset_close_others(fds
);
4260 /* Automatically search for the init system */
4262 m
= 1 + argc
- optind
;
4263 a
= newa(char*, m
+ 1);
4264 memcpy(a
+ 1, argv
+ optind
, m
* sizeof(char*));
4266 a
[0] = (char*) "/usr/lib/systemd/systemd";
4267 execve(a
[0], a
, env_use
);
4269 a
[0] = (char*) "/lib/systemd/systemd";
4270 execve(a
[0], a
, env_use
);
4272 a
[0] = (char*) "/sbin/init";
4273 execve(a
[0], a
, env_use
);
4274 } else if (argc
> optind
)
4275 execvpe(argv
[optind
], argv
+ optind
, env_use
);
/* Best-effort: the shell is started even when the home directory is missing. */
4277 (void) chdir(home
? home
: "/root");
4278 execle("/bin/bash", "-bash", NULL
, env_use
);
4279 execle("/bin/sh", "-sh", NULL
, env_use
);
4283 return log_error_errno(errno
, "execv() failed: %m");
/* Body of the outer child, which prepares the container tree and then
 * spawns the inner child.
 *
 * Steps, in order:
 *  - arm PR_SET_PDEATHSIG with SIGKILL;
 *  - reopen `console` as stdin and duplicate it onto stdout and stderr;
 *  - reset the audit login UID;
 *  - mark all mounts MS_SLAVE|MS_REC so host mounts are received but
 *    container mounts do not propagate back;
 *  - mount the root/home/srv devices;
 *  - determine the UID shift and send it over uid_shift_socket;
 *  - turn `directory` into a bind mount and populate it (volatile setup,
 *    base file system, API mounts, device nodes, pts, propagation dir,
 *    console, seccomp, timezone, resolv.conf, journal, custom and cgroup
 *    mounts);
 *  - move the root to `directory`;
 *  - raw_clone() the inner child with the namespace flags selected by
 *    arg_share_system, arg_private_network and arg_userns.
 *
 * The child side runs inner_child() and _exit()s; the parent side sends the
 * new PID over pid_socket. */
4286 static int outer_child(
4288 const char *directory
,
4289 const char *console
,
4290 const char *root_device
, bool root_device_rw
,
4291 const char *home_device
, bool home_device_rw
,
4292 const char *srv_device
, bool srv_device_rw
,
4298 int uid_shift_socket
,
4310 assert(pid_socket
>= 0);
4311 assert(kmsg_socket
>= 0);
4313 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
4314 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
4317 close_nointr(STDIN_FILENO
);
4318 close_nointr(STDOUT_FILENO
);
4319 close_nointr(STDERR_FILENO
);
4321 r
= open_terminal(console
, O_RDWR
);
4322 if (r
!= STDIN_FILENO
) {
4328 return log_error_errno(r
, "Failed to open console: %m");
4331 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
4332 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
4333 return log_error_errno(errno
, "Failed to duplicate console: %m");
4336 r
= reset_audit_loginuid();
4340 /* Mark everything as slave, so that we still
4341 * receive mounts from the real root, but don't
4342 * propagate mounts to the real root. */
4343 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
4344 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
4346 r
= mount_devices(directory
,
4347 root_device
, root_device_rw
,
4348 home_device
, home_device_rw
,
4349 srv_device
, srv_device_rw
);
4353 r
= determine_uid_shift(directory
);
4358 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
4360 return log_error_errno(errno
, "Failed to send UID shift: %m");
4361 if (l
!= sizeof(arg_uid_shift
)) {
4362 log_error("Short write while sending UID shift.");
4367 /* Turn directory into bind mount */
4368 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
4369 return log_error_errno(errno
, "Failed to make bind mount: %m");
4371 r
= setup_volatile(directory
);
4375 r
= setup_volatile_state(directory
);
4379 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
4383 if (arg_read_only
) {
4384 r
= bind_remount_recursive(directory
, true);
4386 return log_error_errno(r
, "Failed to make tree read-only: %m");
4389 r
= mount_all(directory
, false);
4393 if (copy_devnodes(directory
) < 0)
4396 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
4398 if (setup_pts(directory
) < 0)
4401 r
= setup_propagate(directory
);
4405 r
= setup_dev_console(directory
, console
);
4409 r
= setup_seccomp();
4413 r
= setup_timezone(directory
);
4417 r
= setup_resolv_conf(directory
);
4421 r
= setup_journal(directory
);
4425 r
= mount_custom(directory
);
4429 r
= mount_cgroup(directory
);
4433 r
= mount_move_root(directory
);
4435 return log_error_errno(r
, "Failed to move root directory: %m");
4437 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4438 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
4439 (arg_private_network
? CLONE_NEWNET
: 0) |
4440 (arg_userns
? CLONE_NEWUSER
: 0),
4443 return log_error_errno(errno
, "Failed to fork inner child: %m");
4446 pid_socket
= safe_close(pid_socket
);
4447 uid_shift_socket
= safe_close(uid_shift_socket
);
4449 /* The inner child has all namespaces that are
4450 * requested, so that we all are owned by the user if
4451 * user namespaces are turned on. */
4453 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
, argc
, argv
);
4455 _exit(EXIT_FAILURE
);
4457 _exit(EXIT_SUCCESS
);
4460 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4462 return log_error_errno(errno
, "Failed to send PID: %m");
4463 if (l
!= sizeof(pid
)) {
4464 log_error("Short write while sending PID.");
4468 pid_socket
= safe_close(pid_socket
);
/* Writes the user namespace ID mappings of process `pid`. One mapping line,
 * "0 <arg_uid_shift> <arg_uid_range>", is formatted once and written first to
 * /proc/<pid>/uid_map and then to /proc/<pid>/gid_map, so UIDs and GIDs
 * share the same range. Write failures are logged and returned. */
4473 static int setup_uid_map(pid_t pid
) {
4474 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
4479 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4480 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
4481 r
= write_string_file(uid_map
, line
, 0);
4483 return log_error_errno(r
, "Failed to write UID map: %m");
4485 /* We always assign the same UID and GID ranges */
4486 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4487 r
= write_string_file(uid_map
, line
, 0);
4489 return log_error_errno(r
, "Failed to write GID map: %m");
/* Hands ownership of the container's cgroup to the shifted root user. The
 * cgroup path of `pid` is resolved and translated to a file system path in
 * the SYSTEMD_CGROUP_CONTROLLER hierarchy, and that directory is opened. The
 * directory itself and the files "tasks", "notify_on_release",
 * "cgroup.procs" and "cgroup.clone_children" are fchownat()ed to
 * arg_uid_shift for both owner and group. A failing chown is only warned
 * about and otherwise ignored. */
4494 static int chown_cgroup(pid_t pid
) {
4495 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
4496 _cleanup_close_
int fd
= -1;
4500 r
= cg_pid_get_path(NULL
, pid
, &path
);
4502 return log_error_errno(r
, "Failed to get container cgroup path: %m");
4504 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
4506 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
4508 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
4510 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
4512 FOREACH_STRING(fn
, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4513 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
4514 log_warning_errno(errno
, "Failed to chown() cgroup file %s, ignoring: %m", fn
);
4519 int main(int argc
, char *argv
[]) {
4521 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
4522 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
4523 _cleanup_close_
int master
= -1, image_fd
= -1;
4524 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
4525 int r
, n_fd_passed
, loop_nr
= -1;
4526 char veth_name
[IFNAMSIZ
];
4527 bool secondary
= false, remove_subvol
= false;
4530 int ret
= EXIT_SUCCESS
;
4531 union in_addr_union exposed
= {};
4532 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
4535 log_parse_environment();
4538 r
= parse_argv(argc
, argv
);
4542 r
= determine_names();
4546 if (geteuid() != 0) {
4547 log_error("Need to be root.");
4552 n_fd_passed
= sd_listen_fds(false);
4553 if (n_fd_passed
> 0) {
4554 r
= fdset_new_listen_fds(&fds
, false);
4556 log_error_errno(r
, "Failed to collect file descriptors: %m");
4561 if (arg_directory
) {
4564 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
4565 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4570 if (arg_ephemeral
) {
4571 _cleanup_free_
char *np
= NULL
;
4573 /* If the specified path is a mount point we
4574 * generate the new snapshot immediately
4575 * inside it under a random name. However if
4576 * the specified path is not a mount point we
4577 * create the new snapshot in the parent
4578 * directory, just next to it. */
4579 r
= path_is_mount_point(arg_directory
, 0);
4581 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
4585 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
4587 r
= tempfn_random(arg_directory
, "machine.", &np
);
4589 log_error_errno(r
, "Failed to generate name for snapshot: %m");
4593 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4595 log_error_errno(r
, "Failed to lock %s: %m", np
);
4599 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4601 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
4605 free(arg_directory
);
4609 remove_subvol
= true;
4612 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4614 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
4618 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
4623 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4626 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
4628 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
4632 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
4638 if (path_is_os_tree(arg_directory
) <= 0) {
4639 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
4646 p
= strjoina(arg_directory
,
4647 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
4648 if (access(p
, F_OK
) < 0) {
4649 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
4656 char template[] = "/tmp/nspawn-root-XXXXXX";
4659 assert(!arg_template
);
4661 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4663 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
4667 r
= log_error_errno(r
, "Failed to create image lock: %m");
4671 if (!mkdtemp(template)) {
4672 log_error_errno(errno
, "Failed to create temporary directory: %m");
4677 arg_directory
= strdup(template);
4678 if (!arg_directory
) {
4683 image_fd
= setup_image(&device_path
, &loop_nr
);
4689 r
= dissect_image(image_fd
,
4690 &root_device
, &root_device_rw
,
4691 &home_device
, &home_device_rw
,
4692 &srv_device
, &srv_device_rw
,
4698 r
= custom_mounts_prepare();
4703 isatty(STDIN_FILENO
) > 0 &&
4704 isatty(STDOUT_FILENO
) > 0;
4706 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
4708 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
4712 r
= ptsname_malloc(master
, &console
);
4714 r
= log_error_errno(r
, "Failed to determine tty name: %m");
4718 if (unlockpt(master
) < 0) {
4719 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
4724 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4725 arg_machine
, arg_image
?: arg_directory
);
4727 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
4729 assert_se(sigemptyset(&mask_chld
) == 0);
4730 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
4732 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
4733 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
4738 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
4739 uid_shift_socket_pair
[2] = { -1, -1 };
4740 ContainerStatus container_status
;
4741 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
4742 static const struct sigaction sa
= {
4743 .sa_handler
= nop_handler
,
4744 .sa_flags
= SA_NOCLDSTOP
,
4748 _cleanup_event_unref_ sd_event
*event
= NULL
;
4749 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4750 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
4753 r
= barrier_create(&barrier
);
4755 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
4759 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
4760 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
4764 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
4765 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
4769 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
4770 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
4775 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
4776 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
4780 /* Child can be killed before execv(), so handle SIGCHLD
4781 * in order to interrupt parent's blocking calls and
4782 * give it a chance to call wait() and terminate. */
4783 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4785 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
4789 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4791 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
4795 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
4797 if (errno
== EINVAL
)
4798 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4800 r
= log_error_errno(errno
, "clone() failed: %m");
4806 /* The outer child only has a file system namespace. */
4807 barrier_set_role(&barrier
, BARRIER_CHILD
);
4809 master
= safe_close(master
);
4811 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
4812 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4813 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
4814 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
4816 (void) reset_all_signal_handlers();
4817 (void) reset_signal_mask();
4819 r
= outer_child(&barrier
,
4822 root_device
, root_device_rw
,
4823 home_device
, home_device_rw
,
4824 srv_device
, srv_device_rw
,
4828 kmsg_socket_pair
[1],
4829 rtnl_socket_pair
[1],
4830 uid_shift_socket_pair
[1],
4834 _exit(EXIT_FAILURE
);
4836 _exit(EXIT_SUCCESS
);
4839 barrier_set_role(&barrier
, BARRIER_PARENT
);
4844 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
4845 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
4846 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
4848 /* Wait for the outer child. */
4849 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
4858 /* And now retrieve the PID of the inner child. */
4859 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
4861 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
4864 if (l
!= sizeof(pid
)) {
4865 log_error("Short read while reading inner child PID: %m");
4870 log_debug("Init process invoked as PID " PID_FMT
, pid
);
4873 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
4874 log_error("Child died too early.");
4879 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
4881 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
4884 if (l
!= sizeof(arg_uid_shift
)) {
4885 log_error("Short read while reading UID shift: %m");
4890 r
= setup_uid_map(pid
);
4894 (void) barrier_place(&barrier
); /* #2 */
4897 r
= move_network_interfaces(pid
);
4901 r
= setup_veth(pid
, veth_name
, &ifi
);
4905 r
= setup_bridge(veth_name
, &ifi
);
4909 r
= setup_macvlan(pid
);
4913 r
= setup_ipvlan(pid
);
4917 r
= register_machine(pid
, ifi
);
4921 r
= chown_cgroup(pid
);
4925 /* Notify the child that the parent is ready with all
4926 * its setup (including cgroup-ification), and that
4927 * the child can now hand over control to the code to
4928 * run inside the container. */
4929 (void) barrier_place(&barrier
); /* #3 */
4931 /* Block SIGCHLD here, before notifying child.
4932 * process_pty() will handle it with the other signals. */
4933 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4935 /* Reset signal to default */
4936 r
= default_signals(SIGCHLD
, -1);
4938 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4942 /* Let the child know that we are ready, and wait until the child is completely ready too. */
4943 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
4944 log_error("Client died too early.");
4951 "STATUS=Container running.\n"
4952 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
4954 r
= sd_event_new(&event
);
4956 log_error_errno(r
, "Failed to get default event source: %m");
4960 if (arg_kill_signal
> 0) {
4961 /* Try to kill the init system on SIGINT or SIGTERM */
4962 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4963 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4965 /* Immediately exit */
4966 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4967 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4970 /* simply exit on sigchld */
4971 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
4973 if (arg_expose_ports
) {
4974 r
= watch_rtnl(event
, rtnl_socket_pair
[0], &exposed
, &rtnl
);
4978 (void) expose_ports(rtnl
, &exposed
);
4981 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4983 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
4985 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4989 r
= sd_event_loop(event
);
4991 log_error_errno(r
, "Failed to run event loop: %m");
4995 pty_forward_get_last_char(forward
, &last_char
);
4997 forward
= pty_forward_free(forward
);
4999 if (!arg_quiet
&& last_char
!= '\n')
5002 /* Kill if it is not dead yet anyway */
5003 terminate_machine(pid
);
5005 /* Normally redundant, but better safe than sorry */
5008 r
= wait_for_container(pid
, &container_status
);
5012 /* We failed to wait for the container, or the
5013 * container exited abnormally */
5015 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
5016 /* The container exited with a non-zero
5017 * status, or with zero status and no reboot
5023 /* CONTAINER_REBOOTED, loop again */
5025 if (arg_keep_unit
) {
5026 /* Special handling if we are running as a
5027 * service: instead of simply restarting the
5028 * machine we want to restart the entire
5029 * service, so let's inform systemd about this
5030 * with the special exit code 133. The service
5031 * file uses RestartForceExitStatus=133 so
5032 * that this results in a full nspawn
5033 * restart. This is necessary since we might
5034 * have cgroup parameters set we want to have
5041 flush_ports(&exposed
);
5047 "STATUS=Terminating...");
5052 /* Try to flush whatever is still queued in the pty */
5054 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
5056 loop_remove(loop_nr
, &image_fd
);
5058 if (remove_subvol
&& arg_directory
) {
5061 k
= btrfs_subvol_remove(arg_directory
, true);
5063 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
5069 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
5070 (void) rm_rf(p
, REMOVE_ROOT
);
5073 free(arg_directory
);
5078 strv_free(arg_setenv
);
5079 strv_free(arg_network_interfaces
);
5080 strv_free(arg_network_macvlan
);
5081 strv_free(arg_network_ipvlan
);
5082 custom_mount_free_all();
5084 flush_ports(&exposed
);
5086 while (arg_expose_ports
) {
5087 ExposePort
*p
= arg_expose_ports
;
5088 LIST_REMOVE(ports
, arg_expose_ports
, p
);
5092 return r
< 0 ? EXIT_FAILURE
: ret
;