1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/mount.h>
31 #include <sys/prctl.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
44 #include <selinux/selinux.h>
52 #include <blkid/blkid.h>
55 #include "sd-daemon.h"
58 #include "sd-netlink.h"
59 #include "random-util.h"
66 #include "cgroup-util.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
75 #include "bus-error.h"
78 #include "netlink-util.h"
79 #include "udev-util.h"
80 #include "blkid-util.h"
82 #include "siphash24.h"
84 #include "base-filesystem.h"
86 #include "event-util.h"
87 #include "capability.h"
89 #include "btrfs-util.h"
90 #include "machine-image.h"
92 #include "in-addr-util.h"
93 #include "firewall-util.h"
94 #include "local-addresses.h"
95 #include "formats-util.h"
96 #include "process-util.h"
97 #include "terminal-util.h"
98 #include "hostname-util.h"
99 #include "signal-util.h"
102 #include "seccomp-util.h"
/* Type declarations shared by the option parser and the mount helpers.
 * NOTE(review): only part of each declaration is visible in this listing;
 * further members are referenced later in the file (for ExposePort:
 * protocol, host_port; for CustomMount: destination, options, work_dir,
 * lower, read_only) - confirm against the complete source. */

/* One --port= entry; entries are chained through the "ports" list fields
 * and collected in arg_expose_ports. */
105 typedef struct ExposePort
{
108 uint16_t container_port
;
109 LIST_FIELDS(struct ExposePort
, ports
);
112 typedef enum ContainerStatus
{
113 CONTAINER_TERMINATED
,
/* Journal linking mode selected with --link-journal= / -j
 * (LINK_NO, LINK_AUTO, LINK_GUEST and LINK_HOST are used by parse_argv()). */
117 typedef enum LinkJournal
{
/* --volatile= mode (VOLATILE_NO, VOLATILE_YES, VOLATILE_STATE are used by
 * parse_argv()). */
124 typedef enum Volatile
{
/* Kind of a user-requested mount; also used as secondary sort key in
 * custom_mount_compare(). */
130 typedef enum CustomMountType
{
133 CUSTOM_MOUNT_OVERLAY
,
/* One --bind=/--bind-ro=/--tmpfs=/--overlay=/--overlay-ro= request. */
136 typedef struct CustomMount
{
137 CustomMountType type
;
139 char *source
; /* for overlayfs this is the upper directory */
/* Command-line settings. The initialisers below are the defaults;
 * parse_argv() overwrites them according to the options given. */
146 static char *arg_directory
= NULL
;
147 static char *arg_template
= NULL
;
148 static char *arg_user
= NULL
;
149 static sd_id128_t arg_uuid
= {};
150 static char *arg_machine
= NULL
;
/* The SELinux contexts, the slice and the bridge name are "const char *":
 * parse_argv() stores optarg directly rather than a copy. */
151 static const char *arg_selinux_context
= NULL
;
152 static const char *arg_selinux_apifs_context
= NULL
;
153 static const char *arg_slice
= NULL
;
154 static bool arg_private_network
= false;
155 static bool arg_read_only
= false;
156 static bool arg_boot
= false;
157 static bool arg_ephemeral
= false;
158 static LinkJournal arg_link_journal
= LINK_AUTO
;
159 static bool arg_link_journal_try
= false;
/* Bitmask of capabilities to retain, one bit per CAP_* number. At the end
 * of parse_argv() the --capability= bits (and CAP_NET_ADMIN when private
 * networking is on) are ORed in and the --drop-capability= bits cleared. */
160 static uint64_t arg_retain
=
161 (1ULL << CAP_CHOWN
) |
162 (1ULL << CAP_DAC_OVERRIDE
) |
163 (1ULL << CAP_DAC_READ_SEARCH
) |
164 (1ULL << CAP_FOWNER
) |
165 (1ULL << CAP_FSETID
) |
166 (1ULL << CAP_IPC_OWNER
) |
168 (1ULL << CAP_LEASE
) |
169 (1ULL << CAP_LINUX_IMMUTABLE
) |
170 (1ULL << CAP_NET_BIND_SERVICE
) |
171 (1ULL << CAP_NET_BROADCAST
) |
172 (1ULL << CAP_NET_RAW
) |
173 (1ULL << CAP_SETGID
) |
174 (1ULL << CAP_SETFCAP
) |
175 (1ULL << CAP_SETPCAP
) |
176 (1ULL << CAP_SETUID
) |
177 (1ULL << CAP_SYS_ADMIN
) |
178 (1ULL << CAP_SYS_CHROOT
) |
179 (1ULL << CAP_SYS_NICE
) |
180 (1ULL << CAP_SYS_PTRACE
) |
181 (1ULL << CAP_SYS_TTY_CONFIG
) |
182 (1ULL << CAP_SYS_RESOURCE
) |
183 (1ULL << CAP_SYS_BOOT
) |
184 (1ULL << CAP_AUDIT_WRITE
) |
185 (1ULL << CAP_AUDIT_CONTROL
) |
/* Array of user-requested mounts and its element count; grown by
 * custom_mount_add() and released by custom_mount_free_all(). */
187 static CustomMount
*arg_custom_mounts
= NULL
;
188 static unsigned arg_n_custom_mounts
= 0;
189 static char **arg_setenv
= NULL
;
190 static bool arg_quiet
= false;
191 static bool arg_share_system
= false;
/* Registration is on by default; parse_argv() turns it off when
 * --share-system is given. */
192 static bool arg_register
= true;
193 static bool arg_keep_unit
= false;
194 static char **arg_network_interfaces
= NULL
;
195 static char **arg_network_macvlan
= NULL
;
196 static char **arg_network_ipvlan
= NULL
;
197 static bool arg_network_veth
= false;
198 static const char *arg_network_bridge
= NULL
;
199 static unsigned long arg_personality
= PERSONALITY_INVALID
;
200 static char *arg_image
= NULL
;
201 static Volatile arg_volatile
= VOLATILE_NO
;
/* List of --port= entries (prepended with LIST_PREPEND in parse_argv()). */
202 static ExposePort
*arg_expose_ports
= NULL
;
203 static char **arg_property
= NULL
;
/* UID_INVALID means no explicit shift was given; the default range is
 * 0x10000 IDs. */
204 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
205 static bool arg_userns
= false;
/* 0 means "not set"; parse_argv() substitutes SIGRTMIN+3 when --boot is
 * used and no signal was chosen. */
206 static int arg_kill_signal
= 0;
/* Print the usage summary to stdout. The only argument to the format
 * string is program_invocation_short_name, substituted for the leading %s.
 *
 * Text fixes relative to the previous version: "a ipvlan" -> "an ipvlan",
 * and the stray space before the newline in the --overlay= description
 * was dropped. */
208 static void help(void) {
209 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
210 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
211 " -h --help Show this help\n"
212 " --version Print version string\n"
213 " -q --quiet Do not show status information\n"
214 " -D --directory=PATH Root directory for the container\n"
215 " --template=PATH Initialize root directory from template directory,\n"
217 " -x --ephemeral Run container with snapshot of root directory, and\n"
218 " remove it after exit\n"
219 " -i --image=PATH File system device or disk image for the container\n"
220 " -b --boot Boot up full system (i.e. invoke init)\n"
221 " -u --user=USER Run the command under specified user or uid\n"
222 " -M --machine=NAME Set the machine name for the container\n"
223 " --uuid=UUID Set a specific machine UUID for the container\n"
224 " -S --slice=SLICE Place the container in the specified slice\n"
225 " --property=NAME=VALUE Set scope unit property\n"
226 " --private-users[=UIDBASE[:NUIDS]]\n"
227 " Run within user namespace\n"
228 " --private-network Disable network in container\n"
229 " --network-interface=INTERFACE\n"
230 " Assign an existing network interface to the\n"
232 " --network-macvlan=INTERFACE\n"
233 " Create a macvlan network interface based on an\n"
234 " existing network interface to the container\n"
235 " --network-ipvlan=INTERFACE\n"
236 " Create an ipvlan network interface based on an\n"
237 " existing network interface to the container\n"
238 " -n --network-veth Add a virtual ethernet connection between host\n"
240 " --network-bridge=INTERFACE\n"
241 " Add a virtual ethernet connection between host\n"
242 " and container and add it to an existing bridge on\n"
244 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
245 " Expose a container IP port on the host\n"
246 " -Z --selinux-context=SECLABEL\n"
247 " Set the SELinux security context to be used by\n"
248 " processes in the container\n"
249 " -L --selinux-apifs-context=SECLABEL\n"
250 " Set the SELinux security context to be used by\n"
251 " API/tmpfs file systems in the container\n"
252 " --capability=CAP In addition to the default, retain specified\n"
254 " --drop-capability=CAP Drop the specified capability from the default set\n"
255 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
256 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
257 " try-guest, try-host\n"
258 " -j Equivalent to --link-journal=try-guest\n"
259 " --read-only Mount the root directory read-only\n"
260 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
262 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
263 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
264 " --overlay=PATH[:PATH...]:PATH\n"
265 " Create an overlay mount from the host to\n"
267 " --overlay-ro=PATH[:PATH...]:PATH\n"
268 " Similar, but creates a read-only overlay mount\n"
269 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
270 " --share-system Share system namespaces with host\n"
271 " --register=BOOLEAN Register container as machine\n"
272 " --keep-unit Do not register a scope for the machine, reuse\n"
273 " the service unit nspawn is running in\n"
274 " --volatile[=MODE] Run the system in volatile mode\n"
275 , program_invocation_short_name
);
/* Append one entry of type t to the arg_custom_mounts array.
 *
 * The array is grown by one element with realloc(); the result is kept in
 * the temporary "c" before it is stored back, so the old array pointer is
 * not overwritten directly by the realloc() call. The new slot is reset
 * with a compound literal that sets only .type. "ret" points at that slot
 * (presumably it is what the function returns - the return statement is
 * not visible in this listing). */
278 static CustomMount
* custom_mount_add(CustomMountType t
) {
279 CustomMount
*c
, *ret
;
281 c
= realloc(arg_custom_mounts
, (arg_n_custom_mounts
+ 1) * sizeof(CustomMount
));
285 arg_custom_mounts
= c
;
/* The new element is the one just past the previous end of the array. */
286 ret
= arg_custom_mounts
+ arg_n_custom_mounts
;
287 arg_n_custom_mounts
++;
289 *ret
= (CustomMount
) { .type
= t
};
/* Release every entry of arg_custom_mounts and then the array itself.
 *
 * For each entry the destination string is freed and the overlay work
 * directory is removed from disk with rm_rf() (its result is deliberately
 * ignored). Afterwards the array pointer and counter are reset so the
 * globals describe an empty list again. */
294 static void custom_mount_free_all(void) {
297 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
298 CustomMount
*m
= &arg_custom_mounts
[i
];
301 free(m
->destination
);
/* Best effort: failure to remove the work directory is not reported. */
305 (void) rm_rf(m
->work_dir
, REMOVE_ROOT
|REMOVE_PHYSICAL
);
312 free(arg_custom_mounts
);
313 arg_custom_mounts
= NULL
;
314 arg_n_custom_mounts
= 0;
/* Ordering callback handed to qsort_safe() by custom_mounts_prepare().
 *
 * a and b point at CustomMount elements. The primary key is the
 * destination path as ordered by path_compare(); the mount type is then
 * compared as a second key. */
317 static int custom_mount_compare(const void *a
, const void *b
) {
318 const CustomMount
*x
= a
, *y
= b
;
321 r
= path_compare(x
->destination
, y
->destination
);
325 if (x
->type
< y
->type
)
327 if (x
->type
> y
->type
)
/* Validate and order the custom mounts before they are applied.
 *
 * Sorts arg_custom_mounts with custom_mount_compare(), rejects a custom
 * mount on "/" when user namespacing is on without an explicit UID shift,
 * and picks a work directory name next to the upper directory (m->source)
 * for overlay mounts via tempfn_random(). Failures of tempfn_random() are
 * logged and returned. */
333 static int custom_mounts_prepare(void) {
337 /* Ensure the mounts are applied prefix first. */
338 qsort_safe(arg_custom_mounts
, arg_n_custom_mounts
, sizeof(CustomMount
), custom_mount_compare
);
340 /* Allocate working directories for the overlay file systems that need it */
341 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
342 CustomMount
*m
= &arg_custom_mounts
[i
];
344 if (arg_userns
&& arg_uid_shift
== UID_INVALID
&& path_equal(m
->destination
, "/")) {
345 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Only overlay mounts are considered for a work directory. */
349 if (m
->type
!= CUSTOM_MOUNT_OVERLAY
)
358 r
= tempfn_random(m
->source
, NULL
, &m
->work_dir
);
360 return log_error_errno(r
, "Failed to generate work directory from %s: %m", m
->source
);
/* Store a cleaned-up form of "path" in *b.
 *
 * The path is resolved with canonicalize_file_name(); path_make_absolute_cwd()
 * is the alternative source of "p" (presumably the fallback when
 * canonicalisation fails - the branch condition is not visible here). The
 * result is passed through path_kill_slashes() before being assigned. */
366 static int set_sanitized_path(char **b
, const char *path
) {
372 p
= canonicalize_file_name(path
);
377 p
= path_make_absolute_cwd(path
);
383 *b
= path_kill_slashes(p
);
/* Parse the command line into the arg_* globals.
 *
 * Options are read with getopt_long(); the leading '+' in the short option
 * string makes parsing stop at the first non-option argument, so the
 * container's own command line is left untouched. After the loop the
 * combination of options is checked for conflicts and the final capability
 * mask is computed.
 *
 * Errors are reported through log_error()/log_error_errno().
 *
 * Fix relative to the previous version: a bad container port given to
 * --port= is now reported as such; the message used to be a copy of the
 * host-port message. */
387 static int parse_argv(int argc
, char *argv
[]) {
406 ARG_NETWORK_INTERFACE
,
418 static const struct option options
[] = {
419 { "help", no_argument
, NULL
, 'h' },
420 { "version", no_argument
, NULL
, ARG_VERSION
},
421 { "directory", required_argument
, NULL
, 'D' },
422 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
423 { "ephemeral", no_argument
, NULL
, 'x' },
424 { "user", required_argument
, NULL
, 'u' },
425 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
426 { "boot", no_argument
, NULL
, 'b' },
427 { "uuid", required_argument
, NULL
, ARG_UUID
},
428 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
429 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
430 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
431 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
432 { "bind", required_argument
, NULL
, ARG_BIND
},
433 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
434 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
435 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
436 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
437 { "machine", required_argument
, NULL
, 'M' },
438 { "slice", required_argument
, NULL
, 'S' },
439 { "setenv", required_argument
, NULL
, ARG_SETENV
},
440 { "selinux-context", required_argument
, NULL
, 'Z' },
441 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
442 { "quiet", no_argument
, NULL
, 'q' },
443 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
},
444 { "register", required_argument
, NULL
, ARG_REGISTER
},
445 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
446 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
447 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
448 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
449 { "network-veth", no_argument
, NULL
, 'n' },
450 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
451 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
452 { "image", required_argument
, NULL
, 'i' },
453 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
454 { "port", required_argument
, NULL
, 'p' },
455 { "property", required_argument
, NULL
, ARG_PROPERTY
},
456 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
457 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
/* Capability bits collected from --capability= (plus) and
 * --drop-capability= (minus); applied to arg_retain after the loop. */
462 uint64_t plus
= 0, minus
= 0;
467 while ((c
= getopt_long(argc
, argv
, "+hD:u:bL:M:jS:Z:qi:xp:n", options
, NULL
)) >= 0)
476 puts(PACKAGE_STRING
);
477 puts(SYSTEMD_FEATURES
);
481 r
= set_sanitized_path(&arg_directory
, optarg
);
483 return log_error_errno(r
, "Invalid root directory: %m");
488 r
= set_sanitized_path(&arg_template
, optarg
);
490 return log_error_errno(r
, "Invalid template directory: %m");
495 r
= set_sanitized_path(&arg_image
, optarg
);
497 return log_error_errno(r
, "Invalid image path: %m");
502 arg_ephemeral
= true;
507 arg_user
= strdup(optarg
);
513 case ARG_NETWORK_BRIDGE
:
514 arg_network_bridge
= optarg
;
519 arg_network_veth
= true;
520 arg_private_network
= true;
523 case ARG_NETWORK_INTERFACE
:
524 if (strv_extend(&arg_network_interfaces
, optarg
) < 0)
527 arg_private_network
= true;
530 case ARG_NETWORK_MACVLAN
:
531 if (strv_extend(&arg_network_macvlan
, optarg
) < 0)
534 arg_private_network
= true;
537 case ARG_NETWORK_IPVLAN
:
538 if (strv_extend(&arg_network_ipvlan
, optarg
) < 0)
543 case ARG_PRIVATE_NETWORK
:
544 arg_private_network
= true;
552 r
= sd_id128_from_string(optarg
, &arg_uuid
);
554 log_error("Invalid UUID: %s", optarg
);
564 if (isempty(optarg
)) {
568 if (!machine_name_is_valid(optarg
)) {
569 log_error("Invalid machine name: %s", optarg
);
573 r
= free_and_strdup(&arg_machine
, optarg
);
581 arg_selinux_context
= optarg
;
585 arg_selinux_apifs_context
= optarg
;
589 arg_read_only
= true;
593 case ARG_DROP_CAPABILITY
: {
594 const char *state
, *word
;
/* The argument is a comma-separated list of capability names;
 * "all" selects every bit. */
597 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
598 _cleanup_free_
char *t
;
600 t
= strndup(word
, length
);
604 if (streq(t
, "all")) {
605 if (c
== ARG_CAPABILITY
)
606 plus
= (uint64_t) -1;
608 minus
= (uint64_t) -1;
612 cap
= capability_from_name(t
);
614 log_error("Failed to parse capability %s.", t
);
618 if (c
== ARG_CAPABILITY
)
619 plus
|= 1ULL << (uint64_t) cap
;
621 minus
|= 1ULL << (uint64_t) cap
;
629 arg_link_journal
= LINK_GUEST
;
630 arg_link_journal_try
= true;
633 case ARG_LINK_JOURNAL
:
634 if (streq(optarg
, "auto")) {
635 arg_link_journal
= LINK_AUTO
;
636 arg_link_journal_try
= false;
637 } else if (streq(optarg
, "no")) {
638 arg_link_journal
= LINK_NO
;
639 arg_link_journal_try
= false;
640 } else if (streq(optarg
, "guest")) {
641 arg_link_journal
= LINK_GUEST
;
642 arg_link_journal_try
= false;
643 } else if (streq(optarg
, "host")) {
644 arg_link_journal
= LINK_HOST
;
645 arg_link_journal_try
= false;
646 } else if (streq(optarg
, "try-guest")) {
647 arg_link_journal
= LINK_GUEST
;
648 arg_link_journal_try
= true;
649 } else if (streq(optarg
, "try-host")) {
650 arg_link_journal
= LINK_HOST
;
651 arg_link_journal_try
= true;
653 log_error("Failed to parse link journal mode %s", optarg
);
661 _cleanup_free_
char *source
= NULL
, *destination
= NULL
;
/* SOURCE[:DEST]; without a colon the same path is used for both. */
665 e
= strchr(optarg
, ':');
667 source
= strndup(optarg
, e
- optarg
);
668 destination
= strdup(e
+ 1);
670 source
= strdup(optarg
);
671 destination
= strdup(optarg
);
674 if (!source
|| !destination
)
677 if (!path_is_absolute(source
) || !path_is_absolute(destination
)) {
678 log_error("Invalid bind mount specification: %s", optarg
);
682 m
= custom_mount_add(CUSTOM_MOUNT_BIND
);
687 m
->destination
= destination
;
688 m
->read_only
= c
== ARG_BIND_RO
;
/* Ownership moved into the CustomMount; clear the locals so the
 * _cleanup_free_ handlers do not free the strings. */
690 source
= destination
= NULL
;
696 _cleanup_free_
char *path
= NULL
, *opts
= NULL
;
700 e
= strchr(optarg
, ':');
702 path
= strndup(optarg
, e
- optarg
);
703 opts
= strdup(e
+ 1);
705 path
= strdup(optarg
);
706 opts
= strdup("mode=0755");
712 if (!path_is_absolute(path
)) {
713 log_error("Invalid tmpfs specification: %s", optarg
);
717 m
= custom_mount_add(CUSTOM_MOUNT_TMPFS
);
721 m
->destination
= path
;
730 case ARG_OVERLAY_RO
: {
731 _cleanup_free_
char *upper
= NULL
, *destination
= NULL
;
732 _cleanup_strv_free_
char **lower
= NULL
;
737 lower
= strv_split(optarg
, ":");
741 STRV_FOREACH(i
, lower
) {
742 if (!path_is_absolute(*i
)) {
743 log_error("Overlay path %s is not absolute.", *i
);
751 log_error("--overlay= needs at least two colon-separated directories specified.");
756 /* If two parameters are specified,
757 * the first one is the lower, the
758 * second one the upper directory. And
759 * we'll also define the destination
760 * mount point the same as the upper. */
764 destination
= strdup(upper
);
769 upper
= lower
[n
- 2];
770 destination
= lower
[n
- 1];
774 m
= custom_mount_add(CUSTOM_MOUNT_OVERLAY
);
778 m
->destination
= destination
;
781 m
->read_only
= c
== ARG_OVERLAY_RO
;
783 upper
= destination
= NULL
;
792 if (!env_assignment_is_valid(optarg
)) {
793 log_error("Environment variable assignment '%s' is not valid.", optarg
);
797 n
= strv_env_set(arg_setenv
, optarg
);
801 strv_free(arg_setenv
);
810 case ARG_SHARE_SYSTEM
:
811 arg_share_system
= true;
815 r
= parse_boolean(optarg
);
817 log_error("Failed to parse --register= argument: %s", optarg
);
825 arg_keep_unit
= true;
828 case ARG_PERSONALITY
:
830 arg_personality
= personality_from_string(optarg
);
831 if (arg_personality
== PERSONALITY_INVALID
) {
832 log_error("Unknown or unsupported personality '%s'.", optarg
);
841 arg_volatile
= VOLATILE_YES
;
843 r
= parse_boolean(optarg
);
845 if (streq(optarg
, "state"))
846 arg_volatile
= VOLATILE_STATE
;
848 log_error("Failed to parse --volatile= argument: %s", optarg
);
852 arg_volatile
= r
? VOLATILE_YES
: VOLATILE_NO
;
858 const char *split
, *e
;
859 uint16_t container_port
, host_port
;
/* [PROTOCOL:]HOSTPORT[:CONTAINERPORT]; a "tcp:" or "udp:" prefix
 * selects the protocol. */
863 if ((e
= startswith(optarg
, "tcp:")))
864 protocol
= IPPROTO_TCP
;
865 else if ((e
= startswith(optarg
, "udp:")))
866 protocol
= IPPROTO_UDP
;
869 protocol
= IPPROTO_TCP
;
872 split
= strchr(e
, ':');
/* Copy the host port part into a NUL-terminated scratch buffer so
 * that it can be parsed on its own. */
874 char v
[split
- e
+ 1];
876 memcpy(v
, e
, split
- e
);
879 r
= safe_atou16(v
, &host_port
);
880 if (r
< 0 || host_port
<= 0) {
881 log_error("Failed to parse host port: %s", optarg
);
885 r
= safe_atou16(split
+ 1, &container_port
);
887 r
= safe_atou16(e
, &container_port
);
888 host_port
= container_port
;
/* This check validates the container port (the host port has its own
 * check above), so the message names the container port. */
891 if (r
< 0 || container_port
<= 0) {
892 log_error("Failed to parse container port: %s", optarg
);
/* Each protocol/host-port pair may be given only once. */
896 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
897 if (p
->protocol
== protocol
&& p
->host_port
== host_port
) {
898 log_error("Duplicate port specification: %s", optarg
);
903 p
= new(ExposePort
, 1);
907 p
->protocol
= protocol
;
908 p
->host_port
= host_port
;
909 p
->container_port
= container_port
;
911 LIST_PREPEND(ports
, arg_expose_ports
, p
);
917 if (strv_extend(&arg_property
, optarg
) < 0)
922 case ARG_PRIVATE_USERS
:
924 _cleanup_free_
char *buffer
= NULL
;
925 const char *range
, *shift
;
/* UIDBASE[:NUIDS] */
927 range
= strchr(optarg
, ':');
929 buffer
= strndup(optarg
, range
- optarg
);
935 if (safe_atou32(range
, &arg_uid_range
) < 0 || arg_uid_range
<= 0) {
936 log_error("Failed to parse UID range: %s", range
);
942 if (parse_uid(shift
, &arg_uid_shift
) < 0) {
943 log_error("Failed to parse UID: %s", optarg
);
951 case ARG_KILL_SIGNAL
:
952 arg_kill_signal
= signal_from_string_try_harder(optarg
);
953 if (arg_kill_signal
< 0) {
954 log_error("Cannot parse signal: %s", optarg
);
964 assert_not_reached("Unhandled option");
/* Cross-option consistency checks. */
967 if (arg_share_system
)
968 arg_register
= false;
970 if (arg_boot
&& arg_share_system
) {
971 log_error("--boot and --share-system may not be combined.");
975 if (arg_keep_unit
&& cg_pid_get_owner_uid(0, NULL
) >= 0) {
976 log_error("--keep-unit may not be used when invoked from a user session.");
980 if (arg_directory
&& arg_image
) {
981 log_error("--directory= and --image= may not be combined.");
985 if (arg_template
&& arg_image
) {
986 log_error("--template= and --image= may not be combined.");
990 if (arg_template
&& !(arg_directory
|| arg_machine
)) {
991 log_error("--template= needs --directory= or --machine=.");
995 if (arg_ephemeral
&& arg_template
) {
996 log_error("--ephemeral and --template= may not be combined.");
1000 if (arg_ephemeral
&& arg_image
) {
1001 log_error("--ephemeral and --image= may not be combined.");
1005 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
)) {
1006 log_error("--ephemeral and --link-journal= may not be combined.");
1010 if (arg_volatile
!= VOLATILE_NO
&& arg_read_only
) {
1011 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
1015 if (arg_expose_ports
&& !arg_private_network
) {
1016 log_error("Cannot use --port= without private networking.");
1020 if (arg_userns
&& access("/proc/self/uid_map", F_OK
) < 0)
1021 return log_error_errno(EOPNOTSUPP
, "--private-users= is not supported, kernel compiled without user namespace support.");
/* Final capability mask: defaults plus requested additions (and
 * CAP_NET_ADMIN with private networking), minus requested drops. */
1023 arg_retain
= (arg_retain
| plus
| (arg_private_network
? 1ULL << CAP_NET_ADMIN
: 0)) & ~minus
;
1025 if (arg_boot
&& arg_kill_signal
<= 0)
1026 arg_kill_signal
= SIGRTMIN
+3;
/* Extend a tmpfs mount option string with settings derived from the
 * command line.
 *
 * With user namespacing and a non-zero UID shift, "uid=" and "gid=" set to
 * arg_uid_shift are added. With arg_selinux_apifs_context set, a quoted
 * "context=" option is added. In both cases there are two variants: one
 * that appends to "options" and one that builds the string on its own
 * (presumably used when "options" is NULL - the condition is not visible
 * in this listing).
 *
 * The patched string is handed back through *ret; mount_tmpfs() treats a
 * positive return value as "use the patched string". */
1031 static int tmpfs_patch_options(const char *options
, char **ret
) {
1034 if (arg_userns
&& arg_uid_shift
!= 0) {
1035 assert(arg_uid_shift
!= UID_INVALID
);
/* asprintf()'s return value is deliberately ignored here. */
1038 (void) asprintf(&buf
, "%s,uid=" UID_FMT
",gid=" UID_FMT
, options
, arg_uid_shift
, arg_uid_shift
);
1040 (void) asprintf(&buf
, "uid=" UID_FMT
",gid=" UID_FMT
, arg_uid_shift
, arg_uid_shift
);
1048 if (arg_selinux_apifs_context
) {
1052 t
= strjoin(options
, ",context=\"", arg_selinux_apifs_context
, "\"", NULL
);
1054 t
= strjoin("context=\"", arg_selinux_apifs_context
, "\"", NULL
);
/* Mount the API and temporary file systems listed in mount_table below
 * underneath "dest".
 *
 * Only the table entries whose userns flag equals the "userns" argument
 * are processed in a given call. For each of them the target directory is
 * created if necessary, tmpfs options are passed through
 * tmpfs_patch_options(), and mount() is called. A failure on an entry
 * marked fatal is logged as an error and returned; on other entries it is
 * only logged as a warning. */
1069 static int mount_all(const char *dest
, bool userns
) {
1071 typedef struct MountPoint
{
1075 const char *options
;
1076 unsigned long flags
;
/* Columns: what, where, type, options, flags, and two booleans that the
 * loop below reads as .fatal and .userns. */
1081 static const MountPoint mount_table
[] = {
1082 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, true },
1083 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true, true }, /* Bind mount first */
1084 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, true, true }, /* Then, make it r/o */
1085 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true, false },
1086 { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
, true, false },
1087 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true, false },
1088 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1089 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true, false },
1090 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME
, true, false },
1092 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false, false }, /* Bind mount first */
1093 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_REMOUNT
, false, false }, /* Then, make it r/o */
1100 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
1101 _cleanup_free_
char *where
= NULL
, *options
= NULL
;
1104 if (userns
!= mount_table
[k
].userns
)
1107 where
= prefix_root(dest
, mount_table
[k
].where
);
/* A missing target (-ENOENT) is tolerated at this point. */
1111 r
= path_is_mount_point(where
, AT_SYMLINK_FOLLOW
);
1112 if (r
< 0 && r
!= -ENOENT
)
1113 return log_error_errno(r
, "Failed to detect whether %s is a mount point: %m", where
);
1115 /* Skip this entry if it is not a remount. */
1116 if (mount_table
[k
].what
&& r
> 0)
1119 r
= mkdir_p(where
, 0755);
1121 if (mount_table
[k
].fatal
)
1122 return log_error_errno(r
, "Failed to create directory %s: %m", where
);
1124 log_warning_errno(r
, "Failed to create directory %s: %m", where
);
1128 o
= mount_table
[k
].options
;
1129 if (streq_ptr(mount_table
[k
].type
, "tmpfs")) {
1130 r
= tmpfs_patch_options(o
, &options
);
1137 if (mount(mount_table
[k
].what
,
1139 mount_table
[k
].type
,
1140 mount_table
[k
].flags
,
1143 if (mount_table
[k
].fatal
)
1144 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1146 log_warning_errno(errno
, "mount(%s) failed, ignoring: %m", where
);
/* Apply one bind mount request (m) below the container root "dest".
 *
 * The source is stat()ed first. If the destination already exists, source
 * and destination must both be directories or both be non-directories;
 * if it does not exist (ENOENT) its parent directories are created. The
 * mount point itself is then created, the source is bind mounted onto it
 * and bind_remount_recursive() is called (presumably only for read-only
 * requests, judging by the error message - the condition is not visible
 * in this listing). All failures are logged. */
1153 static int mount_bind(const char *dest
, CustomMount
*m
) {
1154 struct stat source_st
, dest_st
;
1160 if (stat(m
->source
, &source_st
) < 0)
1161 return log_error_errno(errno
, "Failed to stat %s: %m", m
->source
);
1163 where
= prefix_roota(dest
, m
->destination
);
1165 if (stat(where
, &dest_st
) >= 0) {
1166 if (S_ISDIR(source_st
.st_mode
) && !S_ISDIR(dest_st
.st_mode
)) {
1167 log_error("Cannot bind mount directory %s on file %s.", m
->source
, where
);
1171 if (!S_ISDIR(source_st
.st_mode
) && S_ISDIR(dest_st
.st_mode
)) {
1172 log_error("Cannot bind mount file %s on directory %s.", m
->source
, where
);
1176 } else if (errno
== ENOENT
) {
1177 r
= mkdir_parents_label(where
, 0755);
1179 return log_error_errno(r
, "Failed to make parents of %s: %m", where
);
1181 log_error_errno(errno
, "Failed to stat %s: %m", where
);
1185 /* Create the mount point. Any non-directory file can be
1186 * mounted on any non-directory file (regular, fifo, socket,
1189 if (S_ISDIR(source_st
.st_mode
))
1190 r
= mkdir_label(where
, 0755);
1193 if (r
< 0 && r
!= -EEXIST
)
1194 return log_error_errno(r
, "Failed to create mount point %s: %m", where
);
1196 if (mount(m
->source
, where
, NULL
, MS_BIND
, NULL
) < 0)
1197 return log_error_errno(errno
, "mount(%s) failed: %m", where
);
1200 r
= bind_remount_recursive(where
, true);
1202 return log_error_errno(r
, "Read-only bind mount failed: %m");
/* Apply one tmpfs mount request (m) below the container root "dest".
 *
 * The mount point is created with mkdir_p_label() (an already existing
 * directory is fine), the user-supplied options are passed through
 * tmpfs_patch_options(), and a tmpfs is mounted with MS_NODEV and
 * MS_STRICTATIME. Failures are logged and returned. */
1208 static int mount_tmpfs(const char *dest
, CustomMount
*m
) {
1209 const char *where
, *options
;
1210 _cleanup_free_
char *buf
= NULL
;
1216 where
= prefix_roota(dest
, m
->destination
);
1218 r
= mkdir_p_label(where
, 0755);
1219 if (r
< 0 && r
!= -EEXIST
)
1220 return log_error_errno(r
, "Creating mount point for tmpfs %s failed: %m", where
);
1222 r
= tmpfs_patch_options(m
->options
, &buf
);
/* A positive result means a patched string was produced in buf;
 * otherwise the original options are used unchanged. */
1225 options
= r
> 0 ? buf
: m
->options
;
1227 if (mount("tmpfs", where
, "tmpfs", MS_NODEV
|MS_STRICTATIME
, options
) < 0)
1228 return log_error_errno(errno
, "tmpfs mount to %s failed: %m", where
);
/* Apply one overlay mount request (m) below the container root "dest".
 *
 * Creates the mount point and, best effort, the upper directory
 * (m->source), joins the lower directories into a colon-separated list and
 * mounts an "overlay" file system. Two option strings are built: one with
 * only "lowerdir=" (upper directory stacked in front of the lower ones),
 * and one with "upperdir=" and "workdir=" as well (presumably the former
 * is for read-only mounts - the branch condition is not visible here).
 * MS_RDONLY is passed when m->read_only is set. */
1233 static int mount_overlay(const char *dest
, CustomMount
*m
) {
1234 _cleanup_free_
char *lower
= NULL
;
1235 const char *where
, *options
;
1241 where
= prefix_roota(dest
, m
->destination
);
1243 r
= mkdir_label(where
, 0755);
1244 if (r
< 0 && r
!= -EEXIST
)
1245 return log_error_errno(r
, "Creating mount point for overlay %s failed: %m", where
);
1247 (void) mkdir_p_label(m
->source
, 0755);
/* The list is joined in reverse order; the second strv_reverse() call
 * restores the original order of m->lower afterwards. */
1249 strv_reverse(m
->lower
);
1250 lower
= strv_join(m
->lower
, ":");
1251 strv_reverse(m
->lower
);
1256 options
= strjoina("lowerdir=", m
->source
, ":", lower
);
/* The writable variant needs the work directory chosen earlier by
 * custom_mounts_prepare(). */
1258 assert(m
->work_dir
);
1259 (void) mkdir_label(m
->work_dir
, 0700);
1261 options
= strjoina("lowerdir=", lower
, ",upperdir=", m
->source
, ",workdir=", m
->work_dir
);
1264 if (mount("overlay", where
, "overlay", m
->read_only
? MS_RDONLY
: 0, options
) < 0)
1265 return log_error_errno(errno
, "overlay mount to %s failed: %m", where
);
/* Apply all entries of arg_custom_mounts below "dest", in array order,
 * dispatching on the entry type to mount_bind(), mount_tmpfs() or
 * mount_overlay(). An unknown type trips assert_not_reached(). */
1270 static int mount_custom(const char *dest
) {
1276 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
1277 CustomMount
*m
= &arg_custom_mounts
[i
];
1281 case CUSTOM_MOUNT_BIND
:
1282 r
= mount_bind(dest
, m
);
1285 case CUSTOM_MOUNT_TMPFS
:
1286 r
= mount_tmpfs(dest
, m
);
1289 case CUSTOM_MOUNT_OVERLAY
:
1290 r
= mount_overlay(dest
, m
);
1294 assert_not_reached("Unknown custom mount type");
/* Mount one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * "controller" is passed to mount() as the data/options string of the
 * "cgroup" file system. The target is first checked with
 * path_is_mount_point() (a missing path is tolerated). After mounting, a
 * bind remount with MS_RDONLY is performed (presumably only when
 * read_only is set - the condition is not visible in this listing).
 * Failures are logged and returned. */
1304 static int mount_cgroup_hierarchy(const char *dest
, const char *controller
, const char *hierarchy
, bool read_only
) {
1308 to
= strjoina(dest
, "/sys/fs/cgroup/", hierarchy
);
1310 r
= path_is_mount_point(to
, 0);
1311 if (r
< 0 && r
!= -ENOENT
)
1312 return log_error_errno(r
, "Failed to determine if %s is mounted already: %m", to
);
1318 /* The superblock mount options of the mount point need to be
1319 * identical to the hosts', and hence writable... */
1320 if (mount("cgroup", to
, "cgroup", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, controller
) < 0)
1321 return log_error_errno(errno
, "Failed to mount to %s: %m", to
);
1323 /* ... hence let's only make the bind mount read-only, not the
1326 if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1327 return log_error_errno(errno
, "Failed to remount %s read-only: %m", to
);
/* Populate <dest>/sys/fs/cgroup with the cgroup hierarchies.
 *
 * The set of controllers is obtained from cg_kernel_controllers(). For
 * each controller, /sys/fs/cgroup/<controller> on the host is inspected
 * with readlink_malloc(): if it is not a symlink the controller has its
 * own hierarchy, which is mounted read-only; if it is a symlink, the link
 * target names a combined hierarchy, which is mounted read-only and the
 * symlink is recreated inside the container. Finally the "systemd" named
 * hierarchy is mounted writable and the cgroup root tmpfs is remounted
 * read-only. */
1332 static int mount_cgroup(const char *dest
) {
1333 _cleanup_set_free_free_ Set
*controllers
= NULL
;
1334 const char *cgroup_root
;
1337 controllers
= set_new(&string_hash_ops
);
1341 r
= cg_kernel_controllers(controllers
);
1343 return log_error_errno(r
, "Failed to determine cgroup controllers: %m");
1346 _cleanup_free_
char *controller
= NULL
, *origin
= NULL
, *combined
= NULL
;
/* Entries are taken out of the set one by one; the stolen string is
 * released by the _cleanup_free_ handler of "controller". */
1348 controller
= set_steal_first(controllers
);
1352 origin
= prefix_root("/sys/fs/cgroup/", controller
);
1356 r
= readlink_malloc(origin
, &combined
);
1358 /* Not a symbolic link, but directly a single cgroup hierarchy */
1360 r
= mount_cgroup_hierarchy(dest
, controller
, controller
, true);
1365 return log_error_errno(r
, "Failed to read link %s: %m", origin
);
1367 _cleanup_free_
char *target
= NULL
;
1369 target
= prefix_root(dest
, origin
);
1373 /* A symbolic link, a combination of controllers in one hierarchy */
1375 if (!filename_is_valid(combined
)) {
1376 log_warning("Ignoring invalid combined hierarchy %s.", combined
);
1380 r
= mount_cgroup_hierarchy(dest
, combined
, combined
, true);
1384 r
= symlink_idempotent(combined
, target
);
1386 log_error("Invalid existing symlink for combined hierarchy");
1390 return log_error_errno(r
, "Failed to create symlink for combined hierarchy: %m");
/* The systemd hierarchy is the only one mounted with read_only=false. */
1394 r
= mount_cgroup_hierarchy(dest
, "name=systemd,xattr", "systemd", false);
1398 cgroup_root
= prefix_roota(dest
, "/sys/fs/cgroup");
1399 if (mount(NULL
, cgroup_root
, NULL
, MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_STRICTATIME
|MS_RDONLY
, "mode=755") < 0)
1400 return log_error_errno(errno
, "Failed to remount %s read-only: %m", cgroup_root
);
/* Leave only our own cgroup writable inside the container's view of the
 * systemd hierarchy.
 *
 * The calling process's cgroup path is looked up with cg_pid_get_path(),
 * the matching directory below <dest>/sys/fs/cgroup/systemd is bind
 * mounted onto itself, and then the hierarchy root is bind-remounted
 * read-only. Failures are logged and returned.
 *
 * NOTE(review): the first mount() error message prints own_cgroup_path
 * rather than the full path systemd_own that was passed to mount(). */
1405 static int mount_systemd_cgroup_writable(const char *dest
) {
1406 _cleanup_free_
char *own_cgroup_path
= NULL
;
1407 const char *systemd_root
, *systemd_own
;
1412 r
= cg_pid_get_path(NULL
, 0, &own_cgroup_path
);
1414 return log_error_errno(r
, "Failed to determine our own cgroup path: %m");
1416 /* Make our own cgroup a (writable) bind mount */
1417 systemd_own
= strjoina(dest
, "/sys/fs/cgroup/systemd", own_cgroup_path
);
1418 if (mount(systemd_own
, systemd_own
, NULL
, MS_BIND
, NULL
) < 0)
1419 return log_error_errno(errno
, "Failed to turn %s into a bind mount: %m", own_cgroup_path
);
1421 /* And then remount the systemd cgroup root read-only */
1422 systemd_root
= prefix_roota(dest
, "/sys/fs/cgroup/systemd");
1423 if (mount(NULL
, systemd_root
, NULL
, MS_BIND
|MS_REMOUNT
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
|MS_RDONLY
, NULL
) < 0)
1424 return log_error_errno(errno
, "Failed to mount cgroup root read-only: %m");
/* lchown() the path p to the given container-relative uid/gid, translated
 * into the host range of the user namespace.
 *
 * UID_INVALID / GID_INVALID mean "leave unchanged"; the case where both
 * are invalid is tested up front. Each valid ID is shifted by
 * arg_uid_shift and then checked against the window
 * [arg_uid_shift, arg_uid_shift + arg_uid_range). The same shift and
 * range are used for group IDs. */
1429 static int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1435 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1438 if (uid
!= UID_INVALID
) {
1439 uid
+= arg_uid_shift
;
/* "uid < arg_uid_shift" also catches wrap-around of the addition. */
1441 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1445 if (gid
!= GID_INVALID
) {
1446 gid
+= (gid_t
) arg_uid_shift
;
1448 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1452 if (lchown(p
, uid
, gid
) < 0)
/* Create the directory "path" below "root" with the given mode and hand
 * it to uid/gid via userns_lchown().
 *
 * EEXIST from mkdir() is handled separately from other errors (the exact
 * handling is not visible in this listing). */
1458 static int userns_mkdir(const char *root
, const char *path
, mode_t mode
, uid_t uid
, gid_t gid
) {
1461 q
= prefix_roota(root
, path
);
1462 if (mkdir(q
, mode
) < 0) {
1463 if (errno
== EEXIST
)
1468 return userns_lchown(q
, uid
, gid
);
/* Make /etc/localtime inside the container point at the same zone as the
 * host's.
 *
 * The host's /etc/localtime must be a symlink into /usr/share/zoneinfo/
 * (absolute or "../"-relative); otherwise a warning is logged and nothing
 * is changed. If the container's link already names the same zone nothing
 * is done. The zone file must exist inside the container. The new link is
 * created as "../usr/share/zoneinfo/<zone>" and chowned to the container's
 * root via userns_lchown(). Problems are logged.
 *
 * Fix relative to the previous version: the container-side zone path is
 * now built with prefix_roota() instead of prefix_root(). The result was
 * stored in the plain "const char *check" and never freed, i.e. the
 * heap-allocated string leaked on every call. */
1471 static int setup_timezone(const char *dest
) {
1472 _cleanup_free_
char *p
= NULL
, *q
= NULL
;
1473 const char *where
, *check
, *what
;
1479 /* Fix the timezone, if possible */
1480 r
= readlink_malloc("/etc/localtime", &p
);
1482 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
/* z is the zone name: the part of the host link target after the
 * zoneinfo prefix. */
1486 z
= path_startswith(p
, "../usr/share/zoneinfo/");
1488 z
= path_startswith(p
, "/usr/share/zoneinfo/");
1490 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1494 where
= prefix_roota(dest
, "/etc/localtime");
1495 r
= readlink_malloc(where
, &q
);
1497 y
= path_startswith(q
, "../usr/share/zoneinfo/");
1499 y
= path_startswith(q
, "/usr/share/zoneinfo/");
1501 /* Already pointing to the right place? Then do nothing .. */
1502 if (y
&& streq(y
, z
))
1506 check
= strjoina("/usr/share/zoneinfo/", z
);
/* Stack-allocated variant: nothing to free afterwards. */
1507 check
= prefix_roota(dest
, check
);
1508 if (laccess(check
, F_OK
) < 0) {
1509 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
1514 if (r
< 0 && errno
!= ENOENT
) {
1515 log_error_errno(errno
, "Failed to remove existing timezone info %s in container: %m", where
);
1519 what
= strjoina("../usr/share/zoneinfo/", z
);
1520 if (symlink(what
, where
) < 0) {
1521 log_error_errno(errno
, "Failed to correct timezone of container: %m");
1525 r
= userns_lchown(where
, 0, 0);
1527 return log_warning_errno(r
, "Failed to chown /etc/localtime: %m");
/* Best-effort copy of the host's /etc/resolv.conf into the container.
 *
 * arg_private_network is checked first (presumably the function is a no-op
 * in that case -- confirm). The file is copied with
 * copy_file(..., O_TRUNC|O_NOFOLLOW, 0644, 0) to <dest>/etc/resolv.conf.
 * A copy failure is logged at LOG_DEBUG for -ELOOP/-EROFS and at
 * LOG_WARNING otherwise. The copy is then chowned via
 * userns_lchown(where, 0, 0), with failures logged as warnings. */
1532 static int setup_resolv_conf(const char *dest
) {
1533 const char *where
= NULL
;
1538 if (arg_private_network
)
1541 /* Fix resolv.conf, if possible */
1542 where
= prefix_roota(dest
, "/etc/resolv.conf");
1544 r
= copy_file("/etc/resolv.conf", where
, O_TRUNC
|O_NOFOLLOW
, 0644, 0);
1546 /* If the file already exists as symlink, let's
1547 * suppress the warning, under the assumption that
1548 * resolved or something similar runs inside and the
1549 * symlink points there.
1551 * If the disk image is read-only, there's also no
1552 * point in complaining.
1554 log_full_errno(IN_SET(r
, -ELOOP
, -EROFS
) ? LOG_DEBUG
: LOG_WARNING
, r
,
1555 "Failed to copy /etc/resolv.conf to %s: %m", where
);
1559 r
= userns_lchown(where
, 0, 0);
1561 log_warning_errno(r
, "Failed to chown /etc/resolv.conf: %m");
/* Implements --volatile=state: the container tree is remounted read-only
 * and a fresh tmpfs is mounted over its /var.
 *
 * Visible steps: arg_volatile is compared with VOLATILE_STATE,
 * bind_remount_recursive(directory, true) is applied, "<directory>/var" is
 * prepared, the "mode=755" option string is passed through
 * tmpfs_patch_options(), and a tmpfs is mounted on /var with
 * MS_STRICTATIME.
 *
 * NOTE(review): the "Failed to create %s" message prints `directory`
 * although the path being prepared at that point is `p` -- confirm. */
1566 static int setup_volatile_state(const char *directory
) {
1567 _cleanup_free_
char *buf
= NULL
;
1568 const char *p
, *options
;
1573 if (arg_volatile
!= VOLATILE_STATE
)
1576 /* --volatile=state means we simply overmount /var
1577 with a tmpfs, and the rest read-only. */
1579 r
= bind_remount_recursive(directory
, true);
1581 return log_error_errno(r
, "Failed to remount %s read-only: %m", directory
);
1583 p
= prefix_roota(directory
, "/var");
1585 if (r
< 0 && errno
!= EEXIST
)
1586 return log_error_errno(errno
, "Failed to create %s: %m", directory
);
1588 options
= "mode=755";
1589 r
= tmpfs_patch_options(options
, &buf
);
1595 if (mount("tmpfs", p
, "tmpfs", MS_STRICTATIME
, options
) < 0)
1596 return log_error_errno(errno
, "Failed to mount tmpfs to /var: %m");
/* Implements --volatile=yes: the container root becomes a tmpfs that only
 * carries a read-only bind mount of the original /usr.
 *
 * Visible steps: a temporary directory is created with mkdtemp(), a tmpfs
 * (options "mode=755" via tmpfs_patch_options()) is mounted on it,
 * "<directory>/usr" is bind-mounted recursively onto "<template>/usr" and
 * remounted read-only with bind_remount_recursive(), and the prepared tree is
 * moved over `directory` with MS_MOVE. tmpfs_mounted/bind_mounted track which
 * mounts exist; the trailing umount()/rmdir() calls are presumably the
 * failure cleanup path -- confirm. */
1601 static int setup_volatile(const char *directory
) {
1602 bool tmpfs_mounted
= false, bind_mounted
= false;
1603 char template[] = "/tmp/nspawn-volatile-XXXXXX";
1604 _cleanup_free_
char *buf
= NULL
;
1605 const char *f
, *t
, *options
;
1610 if (arg_volatile
!= VOLATILE_YES
)
1613 /* --volatile=yes means we mount a tmpfs to the root dir, and
1614 the original /usr to use inside it, and that read-only. */
1616 if (!mkdtemp(template))
1617 return log_error_errno(errno
, "Failed to create temporary directory: %m");
1619 options
= "mode=755";
1620 r
= tmpfs_patch_options(options
, &buf
);
1626 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME
, options
) < 0) {
1627 r
= log_error_errno(errno
, "Failed to mount tmpfs for root directory: %m");
1631 tmpfs_mounted
= true;
1633 f
= prefix_roota(directory
, "/usr");
1634 t
= prefix_roota(template, "/usr");
1637 if (r
< 0 && errno
!= EEXIST
) {
1638 r
= log_error_errno(errno
, "Failed to create %s: %m", t
);
1642 if (mount(f
, t
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0) {
1643 r
= log_error_errno(errno
, "Failed to create /usr bind mount: %m");
1647 bind_mounted
= true;
1649 r
= bind_remount_recursive(t
, true);
1651 log_error_errno(r
, "Failed to remount %s read-only: %m", t
);
1655 if (mount(template, directory
, NULL
, MS_MOVE
, NULL
) < 0) {
1656 r
= log_error_errno(errno
, "Failed to move root mount: %m");
1660 (void) rmdir(template);
1669 (void) umount(template);
1670 (void) rmdir(template);
/* Formats a 128-bit ID in the hyphenated 8-4-4-4-12 hex layout of a UUID.
 * The format string consumes the 16 bytes expanded by
 * SD_ID128_FORMAT_VAL(id); `s` is declared with room for 37 characters
 * (36 plus terminating NUL). Presumably `s` is both the destination and the
 * return value -- confirm. */
1674 static char* id128_format_as_uuid(sd_id128_t id
, char s
[37]) {
1678 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1679 SD_ID128_FORMAT_VAL(id
));
/* Gives the container its own random boot ID.
 *
 * arg_share_system is checked first (presumably nothing is done in that
 * mode -- confirm). A random ID from sd_id128_randomize() is formatted with
 * id128_format_as_uuid(), written to
 * "<dest>/run/proc-sys-kernel-random-boot-id" using
 * WRITE_STRING_FILE_CREATE, and bind-mounted over
 * "<dest>/proc/sys/kernel/random/boot_id". The bind mount is then remounted
 * read-only; a failure of that last step is only logged as a warning. */
1684 static int setup_boot_id(const char *dest
) {
1685 const char *from
, *to
;
1686 sd_id128_t rnd
= {};
1690 if (arg_share_system
)
1693 /* Generate a new randomized boot ID, so that each boot-up of
1694 * the container gets a new one */
1696 from
= prefix_roota(dest
, "/run/proc-sys-kernel-random-boot-id");
1697 to
= prefix_roota(dest
, "/proc/sys/kernel/random/boot_id");
1699 r
= sd_id128_randomize(&rnd
);
1701 return log_error_errno(r
, "Failed to generate random boot id: %m");
1703 id128_format_as_uuid(rnd
, as_uuid
);
1705 r
= write_string_file(from
, as_uuid
, WRITE_STRING_FILE_CREATE
);
1707 return log_error_errno(r
, "Failed to write boot id: %m");
1709 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1710 r
= log_error_errno(errno
, "Failed to bind mount boot id: %m");
1711 else if (mount(NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
) < 0)
1712 log_warning_errno(errno
, "Failed to make boot id read-only: %m");
/* Replicates a fixed list of host device nodes into the container's /dev.
 *
 * /dev/net is created first via userns_mkdir(). For every entry of the
 * NUL-separated `devnodes` list, "/dev/<name>" on the host is stat()ed:
 * errors other than ENOENT are fatal, and entries that are neither char nor
 * block devices are reported with log_error(). The node is recreated under
 * `dest` with mknod() using the host's st_mode/st_rdev; per the inline
 * comment and messages, a touch plus bind mount is used when mknod() is not
 * permitted. Each node is finally chowned via userns_lchown(to, 0, 0).
 *
 * NOTE(review): the /dev/net failure path logs `r`, but the visible
 * condition tests the userns_mkdir() return value directly without storing
 * it in `r`, so the logged error code may be stale -- confirm. */
1718 static int copy_devnodes(const char *dest
) {
1720 static const char devnodes
[] =
1731 _cleanup_umask_ mode_t u
;
1737 /* Create /dev/net, so that we can create /dev/net/tun in it */
1738 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
1739 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
1741 NULSTR_FOREACH(d
, devnodes
) {
1742 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
1745 from
= strappend("/dev/", d
);
1746 to
= prefix_root(dest
, from
);
1748 if (stat(from
, &st
) < 0) {
1750 if (errno
!= ENOENT
)
1751 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
1753 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
1755 log_error("%s is not a char or block device, cannot copy.", from
);
1759 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
1761 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
1763 /* Some systems abusively restrict mknod but
1764 * allow bind mounts. */
1767 return log_error_errno(r
, "touch (%s) failed: %m", to
);
1768 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1769 return log_error_errno(errno
, "Both mknod and bind mount (%s) failed: %m", to
);
1772 r
= userns_lchown(to
, 0, 0);
1774 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
/* Mounts a private devpts instance in the container and wires up /dev/ptmx.
 *
 * The mount options ("newinstance,ptmxmode=0666,mode=620,uid=...,gid=...",
 * plus a context="..." part when arg_selinux_apifs_context is set) are built
 * with asprintf(). "<dest>/dev/pts" is created with mode 0755, devpts is
 * mounted there with MS_NOSUID|MS_NOEXEC, "<dest>/dev/ptmx" becomes a
 * symlink to "pts/ptmx", and /dev/pts, /dev/ptmx and /dev/pts/ptmx are
 * chowned via userns_lchown(p, 0, 0).
 *
 * NOTE(review): the asprintf() results are cast to void, so an allocation
 * failure is not detected at the call itself (a later check is not visible
 * here). The userns_lchown() failure branches log errno although other
 * callers log its return value -- confirm both. */
1781 static int setup_pts(const char *dest
) {
1782 _cleanup_free_
char *options
= NULL
;
1786 if (arg_selinux_apifs_context
)
1787 (void) asprintf(&options
,
1788 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT
",gid=" GID_FMT
",context=\"%s\"",
1790 arg_uid_shift
+ TTY_GID
,
1791 arg_selinux_apifs_context
);
1794 (void) asprintf(&options
,
1795 "newinstance,ptmxmode=0666,mode=620,uid=" UID_FMT
",gid=" GID_FMT
,
1797 arg_uid_shift
+ TTY_GID
);
1802 /* Mount /dev/pts itself */
1803 p
= prefix_roota(dest
, "/dev/pts");
1804 if (mkdir(p
, 0755) < 0)
1805 return log_error_errno(errno
, "Failed to create /dev/pts: %m");
1806 if (mount("devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
) < 0)
1807 return log_error_errno(errno
, "Failed to mount /dev/pts: %m");
1808 if (userns_lchown(p
, 0, 0) < 0)
1809 return log_error_errno(errno
, "Failed to chown /dev/pts: %m");
1811 /* Create /dev/ptmx symlink */
1812 p
= prefix_roota(dest
, "/dev/ptmx");
1813 if (symlink("pts/ptmx", p
) < 0)
1814 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
1815 if (userns_lchown(p
, 0, 0) < 0)
1816 return log_error_errno(errno
, "Failed to chown /dev/ptmx: %m");
1818 /* And fix /dev/pts/ptmx ownership */
1819 p
= prefix_roota(dest
, "/dev/pts/ptmx");
1820 if (userns_lchown(p
, 0, 0) < 0)
1821 return log_error_errno(errno
, "Failed to chown /dev/pts/ptmx: %m");
/* Exposes the TTY `console` as /dev/console inside the container.
 *
 * The TTY is set to mode 0600 and owned by arg_uid_shift (user and group)
 * with chmod_and_chown(). A placeholder is then created at
 * "<dest>/dev/console" (the error message refers to touch()) and `console`
 * is bind-mounted onto it. */
1826 static int setup_dev_console(const char *dest
, const char *console
) {
1827 _cleanup_umask_ mode_t u
;
1836 r
= chmod_and_chown(console
, 0600, arg_uid_shift
, arg_uid_shift
);
1838 return log_error_errno(r
, "Failed to correct access mode for TTY: %m");
1840 /* We need to bind mount the right tty to /dev/console since
1841 * ptys can only exist on pts file systems. To have something
1842 * to bind mount things on we create an empty regular file. */
1844 to
= prefix_roota(dest
, "/dev/console");
1847 return log_error_errno(r
, "touch() for /dev/console failed: %m");
1849 if (mount(console
, to
, NULL
, MS_BIND
, NULL
) < 0)
1850 return log_error_errno(errno
, "Bind mount for /dev/console failed: %m");
/* Provides a FIFO-backed /proc/kmsg for the container.
 *
 * A FIFO is created at "<dest>/run/kmsg" (mode 0600) and bind-mounted over
 * "<dest>/proc/kmsg". It is opened with O_RDWR|O_NDELAY|O_CLOEXEC and the
 * resulting fd is sent over `kmsg_socket` as an SCM_RIGHTS control message,
 * so that it stays open for as long as the child runs. The FIFO's /run/kmsg
 * name is unlinked at the end. `kmsg_socket` must be a valid fd (asserted). */
1855 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
1856 const char *from
, *to
;
1857 _cleanup_umask_ mode_t u
;
1860 struct cmsghdr cmsghdr
;
1861 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1863 struct msghdr mh
= {
1864 .msg_control
= &control
,
1865 .msg_controllen
= sizeof(control
),
1867 struct cmsghdr
*cmsg
;
1869 assert(kmsg_socket
>= 0);
1873 /* We create the kmsg FIFO as /run/kmsg, but immediately
1874 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1875 * on the reading side behave very similar to /proc/kmsg,
1876 * their writing side behaves differently from /dev/kmsg in
1877 * that writing blocks when nothing is reading. In order to
1878 * avoid any problems with containers deadlocking due to this
1879 * we simply make /dev/kmsg unavailable to the container. */
1880 from
= prefix_roota(dest
, "/run/kmsg");
1881 to
= prefix_roota(dest
, "/proc/kmsg");
1883 if (mkfifo(from
, 0600) < 0)
1884 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
1885 if (mount(from
, to
, NULL
, MS_BIND
, NULL
) < 0)
1886 return log_error_errno(errno
, "Bind mount for /proc/kmsg failed: %m");
1888 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
1890 return log_error_errno(errno
, "Failed to open fifo: %m");
1892 cmsg
= CMSG_FIRSTHDR(&mh
);
1893 cmsg
->cmsg_level
= SOL_SOCKET
;
1894 cmsg
->cmsg_type
= SCM_RIGHTS
;
1895 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1896 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1898 mh
.msg_controllen
= cmsg
->cmsg_len
;
1900 /* Store away the fd in the socket, so that it stays open as
1901 * long as we run the child */
1902 k
= sendmsg(kmsg_socket
, &mh
, MSG_NOSIGNAL
);
1906 return log_error_errno(errno
, "Failed to send FIFO fd: %m");
1908 /* And now make the FIFO unavailable as /run/kmsg... */
1909 (void) unlink(from
);
/* Opens a NETLINK_ROUTE socket and passes it over `send_fd`.
 *
 * arg_expose_ports is checked first (presumably nothing is sent when no
 * ports are exposed -- confirm). The socket is created with
 * SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK and its fd is transferred as an
 * SCM_RIGHTS control message via sendmsg(). The local fd is declared
 * _cleanup_close_. `send_fd` must be a valid fd (asserted). */
1914 static int send_rtnl(int send_fd
) {
1916 struct cmsghdr cmsghdr
;
1917 uint8_t buf
[CMSG_SPACE(sizeof(int))];
1919 struct msghdr mh
= {
1920 .msg_control
= &control
,
1921 .msg_controllen
= sizeof(control
),
1923 struct cmsghdr
*cmsg
;
1924 _cleanup_close_
int fd
= -1;
1927 assert(send_fd
>= 0);
1929 if (!arg_expose_ports
)
1932 fd
= socket(PF_NETLINK
, SOCK_RAW
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, NETLINK_ROUTE
);
1934 return log_error_errno(errno
, "Failed to allocate container netlink: %m");
1936 cmsg
= CMSG_FIRSTHDR(&mh
);
1937 cmsg
->cmsg_level
= SOL_SOCKET
;
1938 cmsg
->cmsg_type
= SCM_RIGHTS
;
1939 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
1940 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
1942 mh
.msg_controllen
= cmsg
->cmsg_len
;
1944 /* Store away the fd in the socket, so that it stays open as
1945 * long as we run the child */
1946 k
= sendmsg(send_fd
, &mh
, MSG_NOSIGNAL
);
1948 return log_error_errno(errno
, "Failed to send netlink fd: %m");
/* Undoes the firewall rules installed for the currently exposed address.
 *
 * Only AF_INET is handled. When ports are configured, fw_add_local_dnat() is
 * called with `false` as its first argument for every entry of
 * arg_expose_ports; failures are only logged as warnings. *exposed is reset
 * to IN_ADDR_NULL at the end. The in_addr_is_null() check presumably
 * short-circuits when nothing is exposed -- confirm. */
1953 static int flush_ports(union in_addr_union
*exposed
) {
1955 int r
, af
= AF_INET
;
1959 if (!arg_expose_ports
)
1962 if (in_addr_is_null(af
, exposed
))
1965 log_debug("Lost IP address.");
1967 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
1968 r
= fw_add_local_dnat(false,
1979 log_warning_errno(r
, "Failed to modify firewall: %m");
1982 *exposed
= IN_ADDR_NULL
;
/* Re-targets the exposed-port firewall rules at the container's current
 * address.
 *
 * Local AF_INET addresses are enumerated with local_addresses(); the first
 * entry is used when its family matches and its scope is below
 * RT_SCOPE_LINK. Without a usable address the rules are dropped through
 * flush_ports(). If the address equals *exposed nothing changes (per the
 * in_addr_equal() check). Otherwise fw_add_local_dnat() is called with
 * `true` for each arg_expose_ports entry, passing the previously exposed
 * address (or NULL if none), and *exposed is updated. Firewall failures are
 * only logged as warnings. */
1986 static int expose_ports(sd_netlink
*rtnl
, union in_addr_union
*exposed
) {
1987 _cleanup_free_
struct local_address
*addresses
= NULL
;
1988 _cleanup_free_
char *pretty
= NULL
;
1989 union in_addr_union new_exposed
;
1992 int af
= AF_INET
, r
;
1996 /* Invoked each time an address is added or removed inside the
1999 if (!arg_expose_ports
)
2002 r
= local_addresses(rtnl
, 0, af
, &addresses
);
2004 return log_error_errno(r
, "Failed to enumerate local addresses: %m");
2007 addresses
[0].family
== af
&&
2008 addresses
[0].scope
< RT_SCOPE_LINK
;
2011 return flush_ports(exposed
);
2013 new_exposed
= addresses
[0].address
;
2014 if (in_addr_equal(af
, exposed
, &new_exposed
))
2017 in_addr_to_string(af
, &new_exposed
, &pretty
);
2018 log_debug("New container IP is %s.", strna(pretty
));
2020 LIST_FOREACH(ports
, p
, arg_expose_ports
) {
2022 r
= fw_add_local_dnat(true,
2031 in_addr_is_null(af
, exposed
) ? NULL
: exposed
);
2033 log_warning_errno(r
, "Failed to modify firewall: %m");
2036 *exposed
= new_exposed
;
/* Netlink match callback (registered in watch_rtnl() for RTM_NEWADDR and
 * RTM_DELADDR): treats `userdata` as the exposed-address slot and re-runs
 * expose_ports() on it. The result of expose_ports() is not used in the
 * visible call. */
2040 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2041 union in_addr_union
*exposed
= userdata
;
2047 expose_ports(rtnl
, exposed
);
/* Receives a netlink socket over `recv_fd` and subscribes to address
 * changes on it.
 *
 * arg_expose_ports is checked first (presumably a no-op without exposed
 * ports -- confirm). The fd arrives as an SCM_RIGHTS control message via
 * recvmsg(), is wrapped with sd_netlink_open_fd(), gets
 * on_address_change() registered for RTM_NEWADDR and RTM_DELADDR with
 * `exposed` as userdata, and is attached to `event`. `ret` presumably
 * receives the netlink object on success -- not visible here.
 *
 * NOTE(review): the shape of the received control message is enforced with
 * assert() only; a malformed message would abort rather than fail
 * gracefully -- confirm that the peer is always trusted. The last error
 * string says "even loop", which looks like a typo for "event loop"; it is a
 * runtime string and is left untouched here. */
2051 static int watch_rtnl(sd_event
*event
, int recv_fd
, union in_addr_union
*exposed
, sd_netlink
**ret
) {
2053 struct cmsghdr cmsghdr
;
2054 uint8_t buf
[CMSG_SPACE(sizeof(int))];
2056 struct msghdr mh
= {
2057 .msg_control
= &control
,
2058 .msg_controllen
= sizeof(control
),
2060 struct cmsghdr
*cmsg
;
2061 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2066 assert(recv_fd
>= 0);
2069 if (!arg_expose_ports
)
2072 k
= recvmsg(recv_fd
, &mh
, MSG_NOSIGNAL
);
2074 return log_error_errno(errno
, "Failed to recv netlink fd: %m");
2076 cmsg
= CMSG_FIRSTHDR(&mh
);
2077 assert(cmsg
->cmsg_level
== SOL_SOCKET
);
2078 assert(cmsg
->cmsg_type
== SCM_RIGHTS
);
2079 assert(cmsg
->cmsg_len
== CMSG_LEN(sizeof(int)));
2080 memcpy(&fd
, CMSG_DATA(cmsg
), sizeof(int));
2082 r
= sd_netlink_open_fd(&rtnl
, fd
);
2085 return log_error_errno(r
, "Failed to create rtnl object: %m");
2088 r
= sd_netlink_add_match(rtnl
, RTM_NEWADDR
, on_address_change
, exposed
);
2090 return log_error_errno(r
, "Failed to subscribe to RTM_NEWADDR messages: %m");
2092 r
= sd_netlink_add_match(rtnl
, RTM_DELADDR
, on_address_change
, exposed
);
2094 return log_error_errno(r
, "Failed to subscribe to RTM_DELADDR messages: %m");
2096 r
= sd_netlink_attach_event(rtnl
, event
, 0);
2098 return log_error_errno(r
, "Failed to add to even loop: %m");
/* Sets the container's hostname to arg_machine via
 * sethostname_idempotent(). arg_share_system is checked first (presumably
 * the hostname is left alone in that mode -- confirm). */
2106 static int setup_hostname(void) {
2108 if (arg_share_system
)
2111 if (sethostname_idempotent(arg_machine
) < 0)
/* Links the container's journal directory with the host's according to
 * arg_link_journal (LINK_NO, LINK_AUTO, LINK_GUEST, LINK_HOST).
 *
 * Visible steps:
 *  - the machine ID is read from "<directory>/etc/machine-id" and parsed
 *    with sd_id128_from_string(); a missing file or empty ID is
 *    special-cased for LINK_AUTO;
 *  - if it equals the host's ID (sd_id128_get_machine()), linking is
 *    refused, logged as a warning for LINK_AUTO and as an error otherwise;
 *  - /var, /var/log and /var/log/journal are created in the container with
 *    userns_mkdir();
 *  - p = "/var/log/journal/<id>" on the host, q = the same path below
 *    `directory`; either being a mount point is rejected unless LINK_AUTO;
 *  - the existing host entry is inspected with readlink_and_make_absolute();
 *  - LINK_GUEST symlinks p -> q; LINK_HOST requires p on the host, and
 *    arg_link_journal_try downgrades failures to debug messages;
 *  - finally p is bind-mounted onto q.
 *
 * NOTE(review): several failure messages after userns_mkdir() log errno
 * although the call's result is stored in `r`, and userns_mkdir(directory,
 * p, ...) failures are reported using `q` -- confirm the intended error
 * source and path. */
2117 static int setup_journal(const char *directory
) {
2118 sd_id128_t machine_id
, this_id
;
2119 _cleanup_free_
char *b
= NULL
, *d
= NULL
;
2120 const char *etc_machine_id
, *p
, *q
;
2124 /* Don't link journals in ephemeral mode */
2128 etc_machine_id
= prefix_roota(directory
, "/etc/machine-id");
2130 r
= read_one_line_file(etc_machine_id
, &b
);
2131 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
2134 return log_error_errno(r
, "Failed to read machine ID from %s: %m", etc_machine_id
);
2137 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
2140 /* Verify validity */
2141 r
= sd_id128_from_string(id
, &machine_id
);
2143 return log_error_errno(r
, "Failed to parse machine ID from %s: %m", etc_machine_id
);
2145 r
= sd_id128_get_machine(&this_id
);
2147 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2149 if (sd_id128_equal(machine_id
, this_id
)) {
2150 log_full(arg_link_journal
== LINK_AUTO
? LOG_WARNING
: LOG_ERR
,
2151 "Host and machine ids are equal (%s): refusing to link journals", id
);
2152 if (arg_link_journal
== LINK_AUTO
)
2157 if (arg_link_journal
== LINK_NO
)
2160 r
= userns_mkdir(directory
, "/var", 0755, 0, 0);
2162 return log_error_errno(r
, "Failed to create /var: %m");
2164 r
= userns_mkdir(directory
, "/var/log", 0755, 0, 0);
2166 return log_error_errno(r
, "Failed to create /var/log: %m");
2168 r
= userns_mkdir(directory
, "/var/log/journal", 0755, 0, 0);
2170 return log_error_errno(r
, "Failed to create /var/log/journal: %m");
2172 p
= strjoina("/var/log/journal/", id
);
2173 q
= prefix_roota(directory
, p
);
2175 if (path_is_mount_point(p
, 0) > 0) {
2176 if (arg_link_journal
!= LINK_AUTO
) {
2177 log_error("%s: already a mount point, refusing to use for journal", p
);
2184 if (path_is_mount_point(q
, 0) > 0) {
2185 if (arg_link_journal
!= LINK_AUTO
) {
2186 log_error("%s: already a mount point, refusing to use for journal", q
);
2193 r
= readlink_and_make_absolute(p
, &d
);
2195 if ((arg_link_journal
== LINK_GUEST
||
2196 arg_link_journal
== LINK_AUTO
) &&
2199 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2201 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2206 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2207 } else if (r
== -EINVAL
) {
2209 if (arg_link_journal
== LINK_GUEST
&&
2212 if (errno
== ENOTDIR
) {
2213 log_error("%s already exists and is neither a symlink nor a directory", p
);
2216 log_error_errno(errno
, "Failed to remove %s: %m", p
);
2220 } else if (r
!= -ENOENT
) {
2221 log_error_errno(errno
, "readlink(%s) failed: %m", p
);
2225 if (arg_link_journal
== LINK_GUEST
) {
2227 if (symlink(q
, p
) < 0) {
2228 if (arg_link_journal_try
) {
2229 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2232 log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2237 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2239 log_warning_errno(errno
, "Failed to create directory %s: %m", q
);
2243 if (arg_link_journal
== LINK_HOST
) {
2244 /* don't create parents here -- if the host doesn't have
2245 * permanent journal set up, don't force it here */
2248 if (arg_link_journal_try
) {
2249 log_debug_errno(errno
, "Failed to create %s, skipping journal setup: %m", p
);
2252 log_error_errno(errno
, "Failed to create %s: %m", p
);
2257 } else if (access(p
, F_OK
) < 0)
2260 if (dir_is_empty(q
) == 0)
2261 log_warning("%s is not empty, proceeding anyway.", q
);
2263 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2265 log_error_errno(errno
, "Failed to create %s: %m", q
);
2269 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
2270 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
/* Calls capability_bounding_set_drop() with the bitwise complement of
 * arg_retain and `false` as the second argument, returning its result
 * unchanged. */
2275 static int drop_capabilities(void) {
2276 return capability_bounding_set_drop(~arg_retain
, false);
/* Registers the container with org.freedesktop.machine1 over the system
 * bus.
 *
 * With arg_keep_unit set, RegisterMachineWithNetwork is called directly.
 * Otherwise a CreateMachineWithNetwork message is built whose a(sv)
 * property array carries: Slice (when arg_slice is non-empty),
 * DevicePolicy=strict, a DeviceAllow list for API device nodes, one
 * DeviceAllow entry per bind-mounted custom mount that is_device_node()
 * accepts ("r" or "rw" depending on read_only), KillSignal plus
 * KillMode=mixed when arg_kill_signal is non-zero, and every assignment in
 * arg_property. Both variants pass arg_uuid, arg_directory and
 * `local_ifindex` (sent as a one-element array only when > 0). A failed
 * call is logged with the bus error message.
 *
 * As the inline comment says, the DeviceAllow list must be kept in sync
 * with systemd-nspawn@.service. */
2279 static int register_machine(pid_t pid
, int local_ifindex
) {
2280 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2281 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2287 r
= sd_bus_default_system(&bus
);
2289 return log_error_errno(r
, "Failed to open system bus: %m");
2291 if (arg_keep_unit
) {
2292 r
= sd_bus_call_method(
2294 "org.freedesktop.machine1",
2295 "/org/freedesktop/machine1",
2296 "org.freedesktop.machine1.Manager",
2297 "RegisterMachineWithNetwork",
2302 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2306 strempty(arg_directory
),
2307 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2309 _cleanup_bus_message_unref_ sd_bus_message
*m
= NULL
;
2313 r
= sd_bus_message_new_method_call(
2316 "org.freedesktop.machine1",
2317 "/org/freedesktop/machine1",
2318 "org.freedesktop.machine1.Manager",
2319 "CreateMachineWithNetwork");
2321 return bus_log_create_error(r
);
2323 r
= sd_bus_message_append(
2327 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid
),
2331 strempty(arg_directory
),
2332 local_ifindex
> 0 ? 1 : 0, local_ifindex
);
2334 return bus_log_create_error(r
);
2336 r
= sd_bus_message_open_container(m
, 'a', "(sv)");
2338 return bus_log_create_error(r
);
2340 if (!isempty(arg_slice
)) {
2341 r
= sd_bus_message_append(m
, "(sv)", "Slice", "s", arg_slice
);
2343 return bus_log_create_error(r
);
2346 r
= sd_bus_message_append(m
, "(sv)", "DevicePolicy", "s", "strict");
2348 return bus_log_create_error(r
);
2350 /* If you make changes here, also make sure to update
2351 * systemd-nspawn@.service, to keep the device
2352 * policies in sync regardless if we are run with or
2353 * without the --keep-unit switch. */
2354 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 9,
2355 /* Allow the container to
2356 * access and create the API
2357 * device nodes, so that
2358 * PrivateDevices= in the
2359 * container can work
2364 "/dev/random", "rwm",
2365 "/dev/urandom", "rwm",
2367 "/dev/net/tun", "rwm",
2368 /* Allow the container
2369 * access to ptys. However,
2371 * container to ever create
2372 * these device nodes. */
2373 "/dev/pts/ptmx", "rw",
2376 return bus_log_create_error(r
);
2378 for (j
= 0; j
< arg_n_custom_mounts
; j
++) {
2379 CustomMount
*cm
= &arg_custom_mounts
[j
];
2381 if (cm
->type
!= CUSTOM_MOUNT_BIND
)
2384 r
= is_device_node(cm
->source
);
2386 return log_error_errno(r
, "Failed to stat %s: %m", cm
->source
);
2389 r
= sd_bus_message_append(m
, "(sv)", "DeviceAllow", "a(ss)", 1,
2390 cm
->source
, cm
->read_only
? "r" : "rw");
2392 return log_error_errno(r
, "Failed to append message arguments: %m");
2396 if (arg_kill_signal
!= 0) {
2397 r
= sd_bus_message_append(m
, "(sv)", "KillSignal", "i", arg_kill_signal
);
2399 return bus_log_create_error(r
);
2401 r
= sd_bus_message_append(m
, "(sv)", "KillMode", "s", "mixed");
2403 return bus_log_create_error(r
);
2406 STRV_FOREACH(i
, arg_property
) {
2407 r
= sd_bus_message_open_container(m
, 'r', "sv");
2409 return bus_log_create_error(r
);
2411 r
= bus_append_unit_property_assignment(m
, *i
);
2415 r
= sd_bus_message_close_container(m
);
2417 return bus_log_create_error(r
);
2420 r
= sd_bus_message_close_container(m
);
2422 return bus_log_create_error(r
);
2424 r
= sd_bus_call(bus
, m
, 0, &error
, NULL
);
2428 log_error("Failed to register machine: %s", bus_error_message(&error
, r
));
/* Asks org.freedesktop.machine1 to terminate the machine belonging to
 * `pid`.
 *
 * Per the inline comment, nothing is done when the unit is being reused.
 * Otherwise the machine's object path is fetched from the Manager interface
 * (read as type "o" from the reply) and a method on the
 * org.freedesktop.machine1.Machine interface is invoked on it. Failures of
 * either bus call are only logged at debug level, since the machine may
 * already have been cleaned up. */
2435 static int terminate_machine(pid_t pid
) {
2436 _cleanup_bus_error_free_ sd_bus_error error
= SD_BUS_ERROR_NULL
;
2437 _cleanup_bus_message_unref_ sd_bus_message
*reply
= NULL
;
2438 _cleanup_bus_flush_close_unref_ sd_bus
*bus
= NULL
;
2445 /* If we are reusing the unit, then just exit, systemd will do
2446 * the right thing when we exit. */
2450 r
= sd_bus_default_system(&bus
);
2452 return log_error_errno(r
, "Failed to open system bus: %m");
2454 r
= sd_bus_call_method(
2456 "org.freedesktop.machine1",
2457 "/org/freedesktop/machine1",
2458 "org.freedesktop.machine1.Manager",
2465 /* Note that the machine might already have been
2466 * cleaned up automatically, hence don't consider it a
2467 * failure if we cannot get the machine object. */
2468 log_debug("Failed to get machine: %s", bus_error_message(&error
, r
));
2472 r
= sd_bus_message_read(reply
, "o", &path
);
2474 return bus_log_parse_error(r
);
2476 r
= sd_bus_call_method(
2478 "org.freedesktop.machine1",
2480 "org.freedesktop.machine1.Machine",
2486 log_debug("Failed to terminate machine: %s", bus_error_message(&error
, r
));
/* Resets the audit login UID of this process to "4294967295".
 *
 * arg_share_system is checked first (presumably a no-op in that mode --
 * confirm). /proc/self/loginuid is read; if it already holds "4294967295"
 * nothing needs to be written. Otherwise that value is written back, and a
 * failure produces the long explanatory message about old kernels with
 * audit enabled (which also mentions a 5s sleep).
 *
 * NOTE(review): the message text reads "or to off auditing", which appears
 * to be missing a word (e.g. "turn off"); it is a runtime string and is left
 * untouched here. */
2493 static int reset_audit_loginuid(void) {
2494 _cleanup_free_
char *p
= NULL
;
2497 if (arg_share_system
)
2500 r
= read_one_line_file("/proc/self/loginuid", &p
);
2504 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2506 /* Already reset? */
2507 if (streq(p
, "4294967295"))
2510 r
= write_string_file("/proc/self/loginuid", "4294967295", 0);
2513 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2514 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2515 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2516 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2517 "using systemd-nspawn. Sleeping for 5s... (%m)");
/* Fixed 128-bit keys passed as `hash_key` to generate_mac(): one each for
 * the host side of a veth pair, the container side, and macvlan
 * interfaces, so that each kind gets a distinct, reproducible MAC. */
2525 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2526 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2527 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
/* Derives a stable, locally administered unicast MAC address.
 *
 * The hash input `v` starts with the host machine ID
 * (sd_id128_get_machine()), followed by arg_machine; `idx` is copied in
 * after the name (under a condition not visible here). The buffer is hashed
 * with siphash24() keyed by hash_key.bytes, and the first ETH_ALEN bytes of
 * the result become the address. Bit 0 of the first octet is cleared
 * (multicast) and bit 1 is set (local assignment), mirroring the kernel's
 * eth_random_addr as the inline comment notes. */
2529 static int generate_mac(struct ether_addr
*mac
, sd_id128_t hash_key
, uint64_t idx
) {
2535 l
= strlen(arg_machine
);
2536 sz
= sizeof(sd_id128_t
) + l
;
2542 /* fetch some persistent data unique to the host */
2543 r
= sd_id128_get_machine((sd_id128_t
*) v
);
2547 /* combine with some data unique (on this host) to this
2548 * container instance */
2549 i
= mempcpy(v
+ sizeof(sd_id128_t
), arg_machine
, l
);
2552 memcpy(i
, &idx
, sizeof(idx
));
2555 /* Let's hash the host machine ID plus the container name. We
2556 * use a fixed, but originally randomly created hash key here. */
2557 siphash24(result
, v
, sz
, hash_key
.bytes
);
2559 assert_cc(ETH_ALEN
<= sizeof(result
));
2560 memcpy(mac
->ether_addr_octet
, result
, ETH_ALEN
);
2562 /* see eth_random_addr in the kernel */
2563 mac
->ether_addr_octet
[0] &= 0xfe; /* clear multicast bit */
2564 mac
->ether_addr_octet
[0] |= 0x02; /* set local assignment bit (IEEE802) */
/* Creates the veth pair that connects host and container.
 *
 * arg_private_network and arg_network_veth are checked first (presumably
 * the function returns early when either is unset -- confirm). The host-side
 * name is "vb-<machine>" in bridge mode and "ve-<machine>" otherwise; the
 * peer inside the container is "host0". Both ends get MACs from
 * generate_mac() (HOST_HASH_KEY / CONTAINER_HASH_KEY). One RTM_NEWLINK
 * message carries the host name and MAC plus a nested
 * IFLA_LINKINFO / IFLA_INFO_DATA("veth") / VETH_INFO_PEER section with the
 * peer name, peer MAC and IFLA_NET_NS_PID = pid. After sd_netlink_call() the
 * host interface index is resolved with if_nametoindex(); presumably it is
 * returned through `ifi` -- not visible here.
 *
 * NOTE(review): snprintf() is bounded by IFNAMSIZ - 1 although iface_name
 * is declared with IFNAMSIZ elements, so names are truncated one character
 * earlier than the buffer requires -- confirm whether that is deliberate. */
2569 static int setup_veth(pid_t pid
, char iface_name
[IFNAMSIZ
], int *ifi
) {
2570 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2571 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2572 struct ether_addr mac_host
, mac_container
;
2575 if (!arg_private_network
)
2578 if (!arg_network_veth
)
2581 /* Use two different interface name prefixes depending whether
2582 * we are in bridge mode or not. */
2583 snprintf(iface_name
, IFNAMSIZ
- 1, "%s-%s",
2584 arg_network_bridge
? "vb" : "ve", arg_machine
);
2586 r
= generate_mac(&mac_container
, CONTAINER_HASH_KEY
, 0);
2588 return log_error_errno(r
, "Failed to generate predictable MAC address for container side: %m");
2590 r
= generate_mac(&mac_host
, HOST_HASH_KEY
, 0);
2592 return log_error_errno(r
, "Failed to generate predictable MAC address for host side: %m");
2594 r
= sd_netlink_open(&rtnl
);
2596 return log_error_errno(r
, "Failed to connect to netlink: %m");
2598 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2600 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2602 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, iface_name
);
2604 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2606 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_host
);
2608 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2610 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2612 return log_error_errno(r
, "Failed to open netlink container: %m");
2614 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "veth");
2616 return log_error_errno(r
, "Failed to open netlink container: %m");
2618 r
= sd_netlink_message_open_container(m
, VETH_INFO_PEER
);
2620 return log_error_errno(r
, "Failed to open netlink container: %m");
2622 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, "host0");
2624 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2626 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac_container
);
2628 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2630 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2632 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2634 r
= sd_netlink_message_close_container(m
);
2636 return log_error_errno(r
, "Failed to close netlink container: %m");
2638 r
= sd_netlink_message_close_container(m
);
2640 return log_error_errno(r
, "Failed to close netlink container: %m");
2642 r
= sd_netlink_message_close_container(m
);
2644 return log_error_errno(r
, "Failed to close netlink container: %m");
2646 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2648 return log_error_errno(r
, "Failed to add new veth interfaces (host0, %s): %m", iface_name
);
2650 i
= (int) if_nametoindex(iface_name
);
2652 return log_error_errno(errno
, "Failed to resolve interface %s: %m", iface_name
);
/* Attaches the host-side veth interface to the configured bridge.
 *
 * arg_private_network, arg_network_veth and arg_network_bridge are checked
 * first (presumably early returns when unset -- confirm). The bridge's index
 * is resolved with if_nametoindex(arg_network_bridge). An RTM_SETLINK
 * message then sets IFF_UP, names the interface via IFLA_IFNAME =
 * veth_name, and sets IFLA_MASTER to the bridge index. `ifi` is not used in
 * the visible lines; presumably it receives the bridge index. */
2659 static int setup_bridge(const char veth_name
[], int *ifi
) {
2660 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2661 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2664 if (!arg_private_network
)
2667 if (!arg_network_veth
)
2670 if (!arg_network_bridge
)
2673 bridge
= (int) if_nametoindex(arg_network_bridge
);
2675 return log_error_errno(errno
, "Failed to resolve interface %s: %m", arg_network_bridge
);
2679 r
= sd_netlink_open(&rtnl
);
2681 return log_error_errno(r
, "Failed to connect to netlink: %m");
2683 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, 0);
2685 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2687 r
= sd_rtnl_message_link_set_flags(m
, IFF_UP
, IFF_UP
);
2689 return log_error_errno(r
, "Failed to set IFF_UP flag: %m");
2691 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, veth_name
);
2693 return log_error_errno(r
, "Failed to add netlink interface name field: %m");
2695 r
= sd_netlink_message_append_u32(m
, IFLA_MASTER
, bridge
);
2697 return log_error_errno(r
, "Failed to add netlink master field: %m");
2699 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2701 return log_error_errno(r
, "Failed to add veth interface to bridge: %m");
/* Resolves a network interface name to its index and verifies that udev
 * has finished initializing it.
 *
 * The index comes from if_nametoindex(name). It is formatted as the udev
 * device id "n<ifindex>" into ifi_str (sized 2 + DECIMAL_STR_MAX(int)) and
 * looked up with udev_device_new_from_device_id(). An interface for which
 * udev_device_get_is_initialized() returns <= 0 is rejected with an error
 * message. Callers use the return value as the interface index. */
2706 static int parse_interface(struct udev
*udev
, const char *name
) {
2707 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
2708 char ifi_str
[2 + DECIMAL_STR_MAX(int)];
2711 ifi
= (int) if_nametoindex(name
);
2713 return log_error_errno(errno
, "Failed to resolve interface %s: %m", name
);
2715 sprintf(ifi_str
, "n%i", ifi
);
2716 d
= udev_device_new_from_device_id(udev
, ifi_str
);
2718 return log_error_errno(errno
, "Failed to get udev device for interface %s: %m", name
);
2720 if (udev_device_get_is_initialized(d
) <= 0) {
2721 log_error("Network interface %s is not initialized yet.", name
);
/* Moves every interface listed in arg_network_interfaces into the network
 * namespace of `pid`.
 *
 * arg_private_network and an empty interface list are checked first
 * (presumably early returns -- confirm). For each name the index is
 * obtained through parse_interface(), and an RTM_SETLINK message for that
 * index with IFLA_NET_NS_PID = pid is sent via sd_netlink_call(). The first
 * failure is logged and returned. */
2728 static int move_network_interfaces(pid_t pid
) {
2729 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2730 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2734 if (!arg_private_network
)
2737 if (strv_isempty(arg_network_interfaces
))
2740 r
= sd_netlink_open(&rtnl
);
2742 return log_error_errno(r
, "Failed to connect to netlink: %m");
2746 log_error("Failed to connect to udev.");
2750 STRV_FOREACH(i
, arg_network_interfaces
) {
2751 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2754 ifi
= parse_interface(udev
, *i
);
2758 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_SETLINK
, ifi
);
2760 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2762 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2764 return log_error_errno(r
, "Failed to append namespace PID to netlink message: %m");
2766 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2768 return log_error_errno(r
, "Failed to move interface %s to namespace: %m", *i
);
/* Creates one macvlan interface per entry of arg_network_macvlan inside the
 * network namespace of `pid`.
 *
 * arg_private_network and an empty list are checked first (presumably early
 * returns -- confirm). For each parent interface: its index comes from
 * parse_interface(), a MAC is derived with
 * generate_mac(MACVLAN_HASH_KEY, idx++), and an RTM_NEWLINK message is
 * built with IFLA_LINK = parent index, IFLA_IFNAME = "mv-<parent>"
 * shortened to IFNAMSIZ-1 characters, the MAC, IFLA_NET_NS_PID = pid and a
 * nested IFLA_LINKINFO / IFLA_INFO_DATA("macvlan") section that sets
 * IFLA_MACVLAN_MODE to MACVLAN_MODE_BRIDGE. The first failure is logged and
 * returned. */
2774 static int setup_macvlan(pid_t pid
) {
2775 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2776 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2781 if (!arg_private_network
)
2784 if (strv_isempty(arg_network_macvlan
))
2787 r
= sd_netlink_open(&rtnl
);
2789 return log_error_errno(r
, "Failed to connect to netlink: %m");
2793 log_error("Failed to connect to udev.");
2797 STRV_FOREACH(i
, arg_network_macvlan
) {
2798 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2799 _cleanup_free_
char *n
= NULL
;
2800 struct ether_addr mac
;
2803 ifi
= parse_interface(udev
, *i
);
2807 r
= generate_mac(&mac
, MACVLAN_HASH_KEY
, idx
++);
2809 return log_error_errno(r
, "Failed to create MACVLAN MAC address: %m");
2811 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2813 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2815 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2817 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2819 n
= strappend("mv-", *i
);
2823 strshorten(n
, IFNAMSIZ
-1);
2825 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2827 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2829 r
= sd_netlink_message_append_ether_addr(m
, IFLA_ADDRESS
, &mac
);
2831 return log_error_errno(r
, "Failed to add netlink MAC address: %m");
2833 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2835 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2837 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2839 return log_error_errno(r
, "Failed to open netlink container: %m");
2841 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "macvlan");
2843 return log_error_errno(r
, "Failed to open netlink container: %m");
2845 r
= sd_netlink_message_append_u32(m
, IFLA_MACVLAN_MODE
, MACVLAN_MODE_BRIDGE
);
2847 return log_error_errno(r
, "Failed to append macvlan mode: %m");
2849 r
= sd_netlink_message_close_container(m
);
2851 return log_error_errno(r
, "Failed to close netlink container: %m");
2853 r
= sd_netlink_message_close_container(m
);
2855 return log_error_errno(r
, "Failed to close netlink container: %m");
2857 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2859 return log_error_errno(r
, "Failed to add new macvlan interfaces: %m");
/* Creates one IPVLAN link per entry of arg_network_ipvlan, directly inside
 * the network namespace of the process `pid`. Mirrors setup_macvlan(): the
 * RTM_NEWLINK request carries IFLA_LINK (host interface index), IFLA_IFNAME
 * ("iv-" + host name, cut to IFNAMSIZ-1 chars), IFLA_NET_NS_PID and a
 * LINKINFO/INFO_DATA("ipvlan") container with IFLA_IPVLAN_MODE set to
 * IPVLAN_MODE_L2. No MAC address is attached here. Netlink failures are
 * logged and returned. */
2865 static int setup_ipvlan(pid_t pid
) {
2866 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
2867 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
2871 if (!arg_private_network
)
2874 if (strv_isempty(arg_network_ipvlan
))
2877 r
= sd_netlink_open(&rtnl
);
2879 return log_error_errno(r
, "Failed to connect to netlink: %m");
2883 log_error("Failed to connect to udev.");
2887 STRV_FOREACH(i
, arg_network_ipvlan
) {
2888 _cleanup_netlink_message_unref_ sd_netlink_message
*m
= NULL
;
2889 _cleanup_free_
char *n
= NULL
;
2892 ifi
= parse_interface(udev
, *i
);
2896 r
= sd_rtnl_message_new_link(rtnl
, &m
, RTM_NEWLINK
, 0);
2898 return log_error_errno(r
, "Failed to allocate netlink message: %m");
2900 r
= sd_netlink_message_append_u32(m
, IFLA_LINK
, ifi
);
2902 return log_error_errno(r
, "Failed to add netlink interface index: %m");
2904 n
= strappend("iv-", *i
);
/* Keep the generated name within the IFNAMSIZ-1 character limit. */
2908 strshorten(n
, IFNAMSIZ
-1);
2910 r
= sd_netlink_message_append_string(m
, IFLA_IFNAME
, n
);
2912 return log_error_errno(r
, "Failed to add netlink interface name: %m");
2914 r
= sd_netlink_message_append_u32(m
, IFLA_NET_NS_PID
, pid
);
2916 return log_error_errno(r
, "Failed to add netlink namespace field: %m");
2918 r
= sd_netlink_message_open_container(m
, IFLA_LINKINFO
);
2920 return log_error_errno(r
, "Failed to open netlink container: %m");
2922 r
= sd_netlink_message_open_container_union(m
, IFLA_INFO_DATA
, "ipvlan");
2924 return log_error_errno(r
, "Failed to open netlink container: %m");
/* Unlike the macvlan mode (u32), the ipvlan mode is appended as a u16. */
2926 r
= sd_netlink_message_append_u16(m
, IFLA_IPVLAN_MODE
, IPVLAN_MODE_L2
);
2928 return log_error_errno(r
, "Failed to add ipvlan mode: %m");
2930 r
= sd_netlink_message_close_container(m
);
2932 return log_error_errno(r
, "Failed to close netlink container: %m");
2934 r
= sd_netlink_message_close_container(m
);
2936 return log_error_errno(r
, "Failed to close netlink container: %m");
2938 r
= sd_netlink_call(rtnl
, m
, 0, NULL
);
2940 return log_error_errno(r
, "Failed to add new ipvlan interfaces: %m");
/* Builds and loads the seccomp filter for the container. The filter starts
 * from SCMP_ACT_ALLOW and then:
 *   - makes each blacklisted syscall fail with EPERM; the arg_retain test on
 *     the entry's capability bit presumably skips entries whose capability
 *     is retained;
 *   - makes creation of NETLINK_AUDIT sockets fail with EAFNOSUPPORT;
 *   - clears the SCMP_FLTATR_CTL_NNP attribute before loading.
 * The filter context is released with seccomp_release() at the end. */
2946 static int setup_seccomp(void) {
/* Table pairing a capability with a syscall that is blocked for it. */
2949 static const struct {
2950 uint64_t capability
;
2953 { CAP_SYS_RAWIO
, SCMP_SYS(iopl
) },
2954 { CAP_SYS_RAWIO
, SCMP_SYS(ioperm
) },
2955 { CAP_SYS_BOOT
, SCMP_SYS(kexec_load
) },
2956 { CAP_SYS_ADMIN
, SCMP_SYS(swapon
) },
2957 { CAP_SYS_ADMIN
, SCMP_SYS(swapoff
) },
2958 { CAP_SYS_ADMIN
, SCMP_SYS(open_by_handle_at
) },
2959 { CAP_SYS_MODULE
, SCMP_SYS(init_module
) },
2960 { CAP_SYS_MODULE
, SCMP_SYS(finit_module
) },
2961 { CAP_SYS_MODULE
, SCMP_SYS(delete_module
) },
2962 { CAP_SYSLOG
, SCMP_SYS(syslog
) },
2965 scmp_filter_ctx seccomp
;
/* Default action: allow everything not matched by a rule added below. */
2969 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
2973 r
= seccomp_add_secondary_archs(seccomp
);
2975 log_error_errno(r
, "Failed to add secondary archs to seccomp filter: %m");
2979 for (i
= 0; i
< ELEMENTSOF(blacklist
); i
++) {
2980 if (arg_retain
& (1ULL << blacklist
[i
].capability
))
2983 r
= seccomp_rule_add(seccomp
, SCMP_ACT_ERRNO(EPERM
), blacklist
[i
].syscall_num
, 0);
2985 continue; /* unknown syscall */
2987 log_error_errno(r
, "Failed to block syscall: %m");
2994 Audit is broken in containers, much of the userspace audit
2995 hookup will fail if running inside a container. We don't
2996 care and just turn off creation of audit sockets.
2998 This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
2999 with EAFNOSUPPORT which audit userspace uses as indication
3000 that audit is disabled in the kernel.
3003 r
= seccomp_rule_add(
3005 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
3008 SCMP_A0(SCMP_CMP_EQ
, AF_NETLINK
),
3009 SCMP_A2(SCMP_CMP_EQ
, NETLINK_AUDIT
));
3011 log_error_errno(r
, "Failed to add audit seccomp rule: %m");
/* Turn the filter's NO_NEW_PRIVS control attribute off (value 0). */
3015 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
3017 log_error_errno(r
, "Failed to unset NO_NEW_PRIVS: %m");
3021 r
= seccomp_load(seccomp
);
/* One load failure is only logged at debug level (kernel presumably built
 * without seccomp); the condition selecting it is not visible here. */
3023 log_debug_errno(r
, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
3028 log_error_errno(r
, "Failed to install seccomp audit filter: %m");
3033 seccomp_release(seccomp
);
3041 static int setup_propagate(const char *root
) {
3044 (void) mkdir_p("/run/systemd/nspawn/", 0755);
3045 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
3046 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
3047 (void) mkdir_p(p
, 0600);
3049 if (userns_mkdir(root
, "/run/systemd", 0755, 0, 0) < 0)
3050 return log_error_errno(errno
, "Failed to create /run/systemd: %m");
3052 if (userns_mkdir(root
, "/run/systemd/nspawn", 0755, 0, 0) < 0)
3053 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn: %m");
3055 if (userns_mkdir(root
, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
3056 return log_error_errno(errno
, "Failed to create /run/systemd/nspawn/incoming: %m");
3058 q
= prefix_roota(root
, "/run/systemd/nspawn/incoming");
3059 if (mount(p
, q
, NULL
, MS_BIND
, NULL
) < 0)
3060 return log_error_errno(errno
, "Failed to install propagation bind mount.");
3062 if (mount(NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0)
3063 return log_error_errno(errno
, "Failed to make propagation mount read-only");
/* Opens the disk image arg_image and makes it available as a block device.
 *
 * The image is opened read-only or read-write depending on arg_read_only.
 * If it already is a block device, its path is duplicated (the rest of that
 * branch is not visible here). Otherwise it must be a regular file, in which
 * case a free loop device number is requested from /dev/loop-control
 * (LOOP_CTL_GET_FREE), /dev/loop<nr> is opened, the image fd is attached
 * with LOOP_SET_FD and the flags LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN (plus
 * LO_FLAGS_READ_ONLY, presumably only when arg_read_only) are applied with
 * LOOP_SET_STATUS64. The loop device path is handed to *device_path.
 *
 * NOTE(review): the "not a regular file or block device" message passes
 * errno and prints %m although fstat() succeeded on that path, so the
 * appended error text looks stale. Confirm and drop the errno part. */
3068 static int setup_image(char **device_path
, int *loop_nr
) {
3069 struct loop_info64 info
= {
3070 .lo_flags
= LO_FLAGS_AUTOCLEAR
|LO_FLAGS_PARTSCAN
3072 _cleanup_close_
int fd
= -1, control
= -1, loop
= -1;
3073 _cleanup_free_
char* loopdev
= NULL
;
3077 assert(device_path
);
3081 fd
= open(arg_image
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3083 return log_error_errno(errno
, "Failed to open %s: %m", arg_image
);
3085 if (fstat(fd
, &st
) < 0)
3086 return log_error_errno(errno
, "Failed to stat %s: %m", arg_image
);
/* Already a block device: no loop device is needed for it. */
3088 if (S_ISBLK(st
.st_mode
)) {
3091 p
= strdup(arg_image
);
3105 if (!S_ISREG(st
.st_mode
)) {
3106 log_error_errno(errno
, "%s is not a regular file or block device: %m", arg_image
);
/* Regular file: ask the loop control device for an unused loop number. */
3110 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3112 return log_error_errno(errno
, "Failed to open /dev/loop-control: %m");
3114 nr
= ioctl(control
, LOOP_CTL_GET_FREE
);
3116 return log_error_errno(errno
, "Failed to allocate loop device: %m");
3118 if (asprintf(&loopdev
, "/dev/loop%i", nr
) < 0)
3121 loop
= open(loopdev
, O_CLOEXEC
|(arg_read_only
? O_RDONLY
: O_RDWR
)|O_NONBLOCK
|O_NOCTTY
);
3123 return log_error_errno(errno
, "Failed to open loop device %s: %m", loopdev
);
/* Attach the image file as backing store of the loop device. */
3125 if (ioctl(loop
, LOOP_SET_FD
, fd
) < 0)
3126 return log_error_errno(errno
, "Failed to set loopback file descriptor on %s: %m", loopdev
);
3129 info
.lo_flags
|= LO_FLAGS_READ_ONLY
;
3131 if (ioctl(loop
, LOOP_SET_STATUS64
, &info
) < 0)
3132 return log_error_errno(errno
, "Failed to set loopback settings on %s: %m", loopdev
);
/* Hand the allocated path string over to the caller. */
3134 *device_path
= loopdev
;
/* Explanatory text appended to the dissect_image() error messages; it tells
 * the user which partition layouts an image must have to be bootable. */
3145 #define PARTITION_TABLE_BLURB \
3146 "Note that the disk image needs to either contain only a single MBR partition of\n" \
3147 "type 0x83 that is marked bootable, or a single GPT partition of type " \
3148 "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
3149 " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
3150 "to be bootable with systemd-nspawn."
/* Finds the root, /home and /srv partitions of a disk image.
 *
 * The image behind `fd` is probed with libblkid for a partition table; only
 * "gpt" and "dos" (MBR) tables are accepted. The kernel partition devices
 * are then enumerated through udev (children of the image block device) and
 * their count is compared with the blkid partition list, re-reading the
 * partition table with BLKRRPART when they do not agree yet.
 *
 * Each kernel partition is matched to its blkid entry by device number:
 *   - GPT: classified by partition type UUID (GPT_HOME, GPT_SRV,
 *     GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); the
 *     GPT_FLAG_READ_ONLY flag decides the matching *_rw value.
 *   - MBR: only partitions with flags 0x80 (bootable) and type 0x83 count.
 *
 * The root device is chosen in this order: native root, secondary root, a
 * single generic Linux partition. Several generic candidates are an error.
 * Results are returned through the *_device / *_device_rw out parameters.
 * The last log_error() is the message used when blkid support is compiled
 * out. */
3152 static int dissect_image(
3154 char **root_device
, bool *root_device_rw
,
3155 char **home_device
, bool *home_device_rw
,
3156 char **srv_device
, bool *srv_device_rw
,
3160 int home_nr
= -1, srv_nr
= -1;
3161 #ifdef GPT_ROOT_NATIVE
3164 #ifdef GPT_ROOT_SECONDARY
3165 int secondary_root_nr
= -1;
3167 _cleanup_free_
char *home
= NULL
, *root
= NULL
, *secondary_root
= NULL
, *srv
= NULL
, *generic
= NULL
;
3168 _cleanup_udev_enumerate_unref_
struct udev_enumerate
*e
= NULL
;
3169 _cleanup_udev_device_unref_
struct udev_device
*d
= NULL
;
3170 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
3171 _cleanup_udev_unref_
struct udev
*udev
= NULL
;
3172 struct udev_list_entry
*first
, *item
;
3173 bool home_rw
= true, root_rw
= true, secondary_root_rw
= true, srv_rw
= true, generic_rw
= true;
3174 bool is_gpt
, is_mbr
, multiple_generic
= false;
3175 const char *pttype
= NULL
;
3182 assert(root_device
);
3183 assert(home_device
);
3188 b
= blkid_new_probe();
3193 r
= blkid_probe_set_device(b
, fd
, 0, 0);
3198 log_error_errno(errno
, "Failed to set device on blkid probe: %m");
/* Ask blkid for partition table information including entry details. */
3202 blkid_probe_enable_partitions(b
, 1);
3203 blkid_probe_set_partitions_flags(b
, BLKID_PARTS_ENTRY_DETAILS
);
3206 r
= blkid_do_safeprobe(b
);
/* -2 and 1 are reported as "no partition table identified"; any other
 * non-zero result is reported as a probe failure. */
3207 if (r
== -2 || r
== 1) {
3208 log_error("Failed to identify any partition table on\n"
3210 PARTITION_TABLE_BLURB
, arg_image
);
3212 } else if (r
!= 0) {
3215 log_error_errno(errno
, "Failed to probe: %m");
3219 (void) blkid_probe_lookup_value(b
, "PTTYPE", &pttype
, NULL
);
3221 is_gpt
= streq_ptr(pttype
, "gpt");
3222 is_mbr
= streq_ptr(pttype
, "dos");
3224 if (!is_gpt
&& !is_mbr
) {
3225 log_error("No GPT or MBR partition table discovered on\n"
3227 PARTITION_TABLE_BLURB
, arg_image
);
3232 pl
= blkid_probe_get_partitions(b
);
3237 log_error("Failed to list partitions of %s", arg_image
);
3245 if (fstat(fd
, &st
) < 0)
3246 return log_error_errno(errno
, "Failed to stat block device: %m");
/* udev device of the whole image; partitions are enumerated as its children. */
3248 d
= udev_device_new_from_devnum(udev
, 'b', st
.st_rdev
);
3256 log_error("Kernel partitions never appeared.");
3260 e
= udev_enumerate_new(udev
);
3264 r
= udev_enumerate_add_match_parent(e
, d
);
3268 r
= udev_enumerate_scan_devices(e
);
3270 return log_error_errno(r
, "Failed to scan for partition devices of %s: %m", arg_image
);
3272 /* Count the partitions enumerated by the kernel */
3274 first
= udev_enumerate_get_list_entry(e
);
3275 udev_list_entry_foreach(item
, first
)
3278 /* Count the partitions enumerated by blkid */
3279 m
= blkid_partlist_numof_partitions(pl
);
3283 log_error("blkid and kernel partition list do not match.");
3289 /* The kernel has probed fewer partitions than
3290 * blkid? Maybe the kernel prober is still
3291 * running or it got EBUSY because udev
3292 * already opened the device. Let's reprobe
3293 * the device, which is a synchronous call
3294 * that waits until probing is complete. */
/* At most 20 attempts, 50 ms apart, while the result is -EBUSY. */
3296 for (j
= 0; j
< 20; j
++) {
3298 r
= ioctl(fd
, BLKRRPART
, 0);
3301 if (r
>= 0 || r
!= -EBUSY
)
3304 /* If something else has the device
3305 * open, such as an udev rule, the
3306 * ioctl will return EBUSY. Since
3307 * there's no way to wait until it
3308 * isn't busy anymore, let's just wait
3309 * a bit, and try again.
3311 * This is really something they
3312 * should fix in the kernel! */
3314 usleep(50 * USEC_PER_MSEC
);
3318 return log_error_errno(r
, "Failed to reread partition table: %m");
3321 e
= udev_enumerate_unref(e
);
/* Walk the kernel partition devices and classify each one. */
3324 first
= udev_enumerate_get_list_entry(e
);
3325 udev_list_entry_foreach(item
, first
) {
3326 _cleanup_udev_device_unref_
struct udev_device
*q
;
3328 unsigned long long flags
;
3334 q
= udev_device_new_from_syspath(udev
, udev_list_entry_get_name(item
));
3339 log_error_errno(errno
, "Failed to get partition device of %s: %m", arg_image
);
3343 qn
= udev_device_get_devnum(q
);
/* Same device number as the image itself: the whole disk, not a partition. */
3347 if (st
.st_rdev
== qn
)
3350 node
= udev_device_get_devnode(q
);
3354 pp
= blkid_partlist_devno_to_partition(pl
, qn
);
3358 flags
= blkid_partition_get_flags(pp
);
3360 nr
= blkid_partition_get_partno(pp
);
3368 if (flags
& GPT_FLAG_NO_AUTO
)
3371 stype
= blkid_partition_get_type_string(pp
);
3375 if (sd_id128_from_string(stype
, &type_id
) < 0)
3378 if (sd_id128_equal(type_id
, GPT_HOME
)) {
/* A candidate is already recorded and this partition number is not lower:
 * apparently the lowest-numbered partition of each type is kept. */
3380 if (home
&& nr
>= home_nr
)
3384 home_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3386 r
= free_and_strdup(&home
, node
);
3390 } else if (sd_id128_equal(type_id
, GPT_SRV
)) {
3392 if (srv
&& nr
>= srv_nr
)
3396 srv_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3398 r
= free_and_strdup(&srv
, node
);
3402 #ifdef GPT_ROOT_NATIVE
3403 else if (sd_id128_equal(type_id
, GPT_ROOT_NATIVE
)) {
3405 if (root
&& nr
>= root_nr
)
3409 root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3411 r
= free_and_strdup(&root
, node
);
3416 #ifdef GPT_ROOT_SECONDARY
3417 else if (sd_id128_equal(type_id
, GPT_ROOT_SECONDARY
)) {
3419 if (secondary_root
&& nr
>= secondary_root_nr
)
3422 secondary_root_nr
= nr
;
3423 secondary_root_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3425 r
= free_and_strdup(&secondary_root
, node
);
3430 else if (sd_id128_equal(type_id
, GPT_LINUX_GENERIC
)) {
3433 multiple_generic
= true;
3435 generic_rw
= !(flags
& GPT_FLAG_READ_ONLY
);
3437 r
= free_and_strdup(&generic
, node
);
3443 } else if (is_mbr
) {
3446 if (flags
!= 0x80) /* Bootable flag */
3449 type
= blkid_partition_get_type(pp
);
3450 if (type
!= 0x83) /* Linux partition */
3454 multiple_generic
= true;
3458 r
= free_and_strdup(&root
, node
);
/* Root selection: native root, else secondary root, else a generic one. */
3466 *root_device
= root
;
3469 *root_device_rw
= root_rw
;
3471 } else if (secondary_root
) {
3472 *root_device
= secondary_root
;
3473 secondary_root
= NULL
;
3475 *root_device_rw
= secondary_root_rw
;
3477 } else if (generic
) {
3479 /* There were no partitions with precise meanings
3480 * around, but we found generic partitions. In this
3481 * case, if there's only one, we can go ahead and boot
3482 * it, otherwise we bail out, because we really cannot
3483 * make any sense of it. */
3485 if (multiple_generic
) {
3486 log_error("Identified multiple bootable Linux partitions on\n"
3488 PARTITION_TABLE_BLURB
, arg_image
);
3492 *root_device
= generic
;
3495 *root_device_rw
= generic_rw
;
3498 log_error("Failed to identify root partition in disk image\n"
3500 PARTITION_TABLE_BLURB
, arg_image
);
3505 *home_device
= home
;
3508 *home_device_rw
= home_rw
;
3515 *srv_device_rw
= srv_rw
;
3520 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the block device `what` onto the path formed by concatenating
 * `where` and `directory`. The file system type is detected with libblkid
 * (superblock probing, "TYPE" value); LUKS containers are rejected. The
 * mount always uses MS_NODEV and adds MS_RDONLY when `rw` is false. The last
 * log_error() is the message used when blkid support is compiled out.
 *
 * NOTE(review): the "cannot determine" branch tests r == -1 || r == 1, while
 * dissect_image() tests -2 || 1 for the same blkid_do_safeprobe() call.
 * libblkid documents -2 as "ambivalent result" and -1 as error, so -1 here
 * looks like it should be -2. Confirm against the libblkid reference. */
3525 static int mount_device(const char *what
, const char *where
, const char *directory
, bool rw
) {
3527 _cleanup_blkid_free_probe_ blkid_probe b
= NULL
;
3528 const char *fstype
, *p
;
/* Mount point: `where` immediately followed by `directory`. */
3538 p
= strjoina(where
, directory
);
3543 b
= blkid_new_probe_from_filename(what
);
3547 log_error_errno(errno
, "Failed to allocate prober for %s: %m", what
);
/* Only the file system type is needed from the superblock probe. */
3551 blkid_probe_enable_superblocks(b
, 1);
3552 blkid_probe_set_superblocks_flags(b
, BLKID_SUBLKS_TYPE
);
3555 r
= blkid_do_safeprobe(b
);
3556 if (r
== -1 || r
== 1) {
3557 log_error("Cannot determine file system type of %s", what
);
3559 } else if (r
!= 0) {
3562 log_error_errno(errno
, "Failed to probe %s: %m", what
);
3567 if (blkid_probe_lookup_value(b
, "TYPE", &fstype
, NULL
) < 0) {
3570 log_error("Failed to determine file system type of %s", what
);
3574 if (streq(fstype
, "crypto_LUKS")) {
3575 log_error("nspawn currently does not support LUKS disk images.");
3579 if (mount(what
, p
, fstype
, MS_NODEV
|(rw
? 0 : MS_RDONLY
), NULL
) < 0)
3580 return log_error_errno(errno
, "Failed to mount %s: %m", what
);
3584 log_error("--image= is not supported, compiled without blkid support.");
/* Mounts the partitions found in the image below arg_directory: the root
 * device on the directory itself (NULL sub-path), the home device on
 * "/home" and the srv device on "/srv", each read-write or read-only
 * according to its *_rw flag. Each mount_device() call is presumably made
 * only when the matching device is set. A failure is logged and returned. */
3589 static int mount_devices(
3591 const char *root_device
, bool root_device_rw
,
3592 const char *home_device
, bool home_device_rw
,
3593 const char *srv_device
, bool srv_device_rw
) {
3599 r
= mount_device(root_device
, arg_directory
, NULL
, root_device_rw
);
3601 return log_error_errno(r
, "Failed to mount root directory: %m");
3605 r
= mount_device(home_device
, arg_directory
, "/home", home_device_rw
);
3607 return log_error_errno(r
, "Failed to mount home directory: %m");
3611 r
= mount_device(srv_device
, arg_directory
, "/srv", srv_device_rw
);
3613 return log_error_errno(r
, "Failed to mount server data directory: %m");
/* Tears down the loop device with number `nr`. If `image_fd` refers to an
 * open descriptor, the backing file is detached with LOOP_CLR_FD and the
 * descriptor is closed (the caller's variable receives safe_close()'s
 * result). The loop device is then removed through /dev/loop-control with
 * LOOP_CTL_REMOVE. All failures are only logged; nothing is returned. */
3619 static void loop_remove(int nr
, int *image_fd
) {
3620 _cleanup_close_
int control
= -1;
3626 if (image_fd
&& *image_fd
>= 0) {
3627 r
= ioctl(*image_fd
, LOOP_CLR_FD
);
3629 log_debug_errno(errno
, "Failed to close loop image: %m");
3630 *image_fd
= safe_close(*image_fd
);
3633 control
= open("/dev/loop-control", O_RDWR
|O_CLOEXEC
|O_NOCTTY
|O_NONBLOCK
);
3635 log_warning_errno(errno
, "Failed to open /dev/loop-control: %m");
3639 r
= ioctl(control
, LOOP_CTL_REMOVE
, nr
);
3641 log_debug_errno(errno
, "Failed to remove loop %d: %m", nr
);
/* Forks a child that runs "getent <database> <key>" with its standard
 * output connected to a pipe. In the child, stdin and stderr are pointed at
 * /dev/null, signal handlers and the signal mask are reset, all other file
 * descriptors are closed and getent is executed with an empty environment,
 * trying /usr/bin/getent first and /bin/getent second. The parent closes
 * the write end of the pipe. Callers use the return value as a readable
 * descriptor and `rpid` as the child PID (see change_uid_gid()); the lines
 * setting them are presumably the ones following the visible code. */
3644 static int spawn_getent(const char *database
, const char *key
, pid_t
*rpid
) {
3652 if (pipe2(pipe_fds
, O_CLOEXEC
) < 0)
3653 return log_error_errno(errno
, "Failed to allocate pipe: %m");
3657 return log_error_errno(errno
, "Failed to fork getent child: %m");
3658 else if (pid
== 0) {
/* Points at a NULL entry, i.e. an empty environment block for execle(). */
3660 char *empty_env
= NULL
;
/* Child: write end of the pipe becomes stdout. */
3662 if (dup3(pipe_fds
[1], STDOUT_FILENO
, 0) < 0)
3663 _exit(EXIT_FAILURE
);
/* Close the original pipe ends unless they are one of the stdio fds. */
3665 if (pipe_fds
[0] > 2)
3666 safe_close(pipe_fds
[0]);
3667 if (pipe_fds
[1] > 2)
3668 safe_close(pipe_fds
[1]);
3670 nullfd
= open("/dev/null", O_RDWR
);
3672 _exit(EXIT_FAILURE
);
3674 if (dup3(nullfd
, STDIN_FILENO
, 0) < 0)
3675 _exit(EXIT_FAILURE
);
3677 if (dup3(nullfd
, STDERR_FILENO
, 0) < 0)
3678 _exit(EXIT_FAILURE
);
3683 (void) reset_all_signal_handlers();
3684 (void) reset_signal_mask();
3685 close_all_fds(NULL
, 0);
/* The second execle() only runs if the first one failed. */
3687 execle("/usr/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3688 execle("/bin/getent", "getent", database
, key
, NULL
, &empty_env
);
3689 _exit(EXIT_FAILURE
);
/* Parent: only the read end is kept. */
3692 pipe_fds
[1] = safe_close(pipe_fds
[1]);
/* Switches the calling process to the user named by arg_user.
 *
 * When no user is given, or it is "root" or "0", the credentials are reset
 * with reset_uid_gid(). Otherwise the user is resolved inside the container
 * by running getent twice through spawn_getent():
 *   1. "getent passwd <user>": the colon-separated line is split into its
 *      fields to obtain UID, GID and home directory;
 *   2. "getent initgroups <user>": the words after the user name are parsed
 *      as the supplementary group list.
 * The home directory is then created (mkdir_parents() + mkdir_safe(), with
 * -EEXIST tolerated), the standard streams are chown()ed best-effort, and
 * setgroups(), setresgid() and setresuid() are applied in that order.
 * `_home` presumably receives the home directory string.
 *
 * NOTE(review): the failure messages say "setregid()" / "setreuid()" while
 * the calls are setresgid() / setresuid(). The messages look outdated. */
3699 static int change_uid_gid(char **_home
) {
3700 char line
[LINE_MAX
], *x
, *u
, *g
, *h
;
3701 const char *word
, *state
;
3702 _cleanup_free_ uid_t
*uids
= NULL
;
3703 _cleanup_free_
char *home
= NULL
;
3704 _cleanup_fclose_
FILE *f
= NULL
;
3705 _cleanup_close_
int fd
= -1;
3706 unsigned n_uids
= 0;
3715 if (!arg_user
|| streq(arg_user
, "root") || streq(arg_user
, "0")) {
3716 /* Reset everything fully to 0, just in case */
3718 r
= reset_uid_gid();
3720 return log_error_errno(r
, "Failed to become root: %m");
3726 /* First, get user credentials */
3727 fd
= spawn_getent("passwd", arg_user
, &pid
);
3731 f
= fdopen(fd
, "r");
3736 if (!fgets(line
, sizeof(line
), f
)) {
3739 log_error("Failed to resolve user %s.", arg_user
);
3743 log_error_errno(errno
, "Failed to read from getent: %m");
3749 wait_for_terminate_and_warn("getent passwd", pid
, true);
/* Walk the passwd line field by field; each strchr() finds the next ':'. */
3751 x
= strchr(line
, ':');
3753 log_error("/etc/passwd entry has invalid user field.");
3757 u
= strchr(x
+1, ':');
3759 log_error("/etc/passwd entry has invalid password field.");
3766 log_error("/etc/passwd entry has invalid UID field.");
3774 log_error("/etc/passwd entry has invalid GID field.");
3779 h
= strchr(x
+1, ':');
3781 log_error("/etc/passwd entry has invalid GECOS field.");
3788 log_error("/etc/passwd entry has invalid home directory field.");
3794 r
= parse_uid(u
, &uid
);
3796 log_error("Failed to parse UID of user.");
3800 r
= parse_gid(g
, &gid
);
3802 log_error("Failed to parse GID of user.");
3810 /* Second, get group memberships */
3811 fd
= spawn_getent("initgroups", arg_user
, &pid
);
3816 f
= fdopen(fd
, "r");
3821 if (!fgets(line
, sizeof(line
), f
)) {
3823 log_error("Failed to resolve user %s.", arg_user
);
3827 log_error_errno(errno
, "Failed to read from getent: %m");
3833 wait_for_terminate_and_warn("getent initgroups", pid
, true);
3835 /* Skip over the username and subsequent separator whitespace */
3837 x
+= strcspn(x
, WHITESPACE
);
3838 x
+= strspn(x
, WHITESPACE
);
/* Each remaining word is one supplementary group ID. */
3840 FOREACH_WORD(word
, l
, x
, state
) {
3846 if (!GREEDY_REALLOC(uids
, sz
, n_uids
+1))
3849 r
= parse_uid(c
, &uids
[n_uids
++]);
3851 log_error("Failed to parse group data from getent.");
3856 r
= mkdir_parents(home
, 0775);
3858 return log_error_errno(r
, "Failed to make home root directory: %m");
3860 r
= mkdir_safe(home
, 0755, uid
, gid
);
3861 if (r
< 0 && r
!= -EEXIST
)
3862 return log_error_errno(r
, "Failed to make home directory: %m");
/* Best effort: let the new user own the inherited standard streams. */
3864 (void) fchown(STDIN_FILENO
, uid
, gid
);
3865 (void) fchown(STDOUT_FILENO
, uid
, gid
);
3866 (void) fchown(STDERR_FILENO
, uid
, gid
);
/* Groups first, then GID, then UID: the UID change comes last. */
3868 if (setgroups(n_uids
, uids
) < 0)
3869 return log_error_errno(errno
, "Failed to set auxiliary groups: %m");
3871 if (setresgid(gid
, gid
, gid
) < 0)
3872 return log_error_errno(errno
, "setregid() failed: %m");
3874 if (setresuid(uid
, uid
, uid
) < 0)
3875 return log_error_errno(errno
, "setreuid() failed: %m");
3887 * < 0 : wait_for_terminate() failed to get the state of the
3888 * container, the container was terminated by a signal, or
3889 * failed for an unknown reason. No change is made to the
3890 * container argument.
3891 * > 0 : The program executed in the container terminated with an
3892 * error. The exit code of the program executed in the
3893 * container is returned. The container argument has been set
3894 * to CONTAINER_TERMINATED.
3895 * 0 : The container is being rebooted, has been shut down or exited
3896 * successfully. The container argument has been set to either
3897 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3899 * That is, success is indicated by a return value of zero, and an
3900 * error is indicated by a non-zero value.
/* Waits for the container process `pid` and classifies how it ended, based
 * on the siginfo filled in by wait_for_terminate():
 *   - normal exit: logs success or the error code, sets *container to
 *     CONTAINER_TERMINATED and returns the exit status;
 *   - si_status SIGINT: logged as "has been shut down", CONTAINER_TERMINATED;
 *   - si_status SIGHUP: logged as "is being rebooted", CONTAINER_REBOOTED;
 *   - other signals and unknown codes: logged as errors.
 * A failure of wait_for_terminate() itself is logged as a warning and
 * returned. */
3902 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
3906 r
= wait_for_terminate(pid
, &status
);
3908 return log_warning_errno(r
, "Failed to wait for container: %m");
3910 switch (status
.si_code
) {
3913 if (status
.si_status
== 0) {
/* Informational messages drop to debug level when arg_quiet is set. */
3914 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
3917 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
3919 *container
= CONTAINER_TERMINATED
;
3920 return status
.si_status
;
3923 if (status
.si_status
== SIGINT
) {
3925 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
3926 *container
= CONTAINER_TERMINATED
;
3929 } else if (status
.si_status
== SIGHUP
) {
3931 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
3932 *container
= CONTAINER_REBOOTED
;
3936 /* CLD_KILLED fallthrough */
3939 log_error("Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
3943 log_error("Container %s failed due to unknown reason.", arg_machine
);
/* Signal handler that intentionally does nothing with the signal it gets. */
static void nop_handler(int sig) {
        (void) sig;
}
/* sd-event signal callback for an orderly shutdown request. `userdata`
 * carries the container PID (packed with the UINT32 pointer helpers). The
 * container is sent arg_kill_signal; if that works, the user is told that a
 * second SIGTERM terminates immediately and the stored PID is cleared from
 * the event source, so a later invocation presumably takes the other path
 * and asks the event loop to exit with code 0. */
3952 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
3955 pid
= PTR_TO_UINT32(userdata
);
3957 if (kill(pid
, arg_kill_signal
) >= 0) {
3958 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3959 sd_event_source_set_userdata(s
, NULL
);
3964 sd_event_exit(sd_event_source_get_event(s
), 0);
/* Fills in whichever of arg_image, arg_directory and arg_machine were not
 * given on the command line.
 *
 * With neither image nor directory set, the machine name is looked up with
 * image_find(): a raw image sets arg_image, any other type sets
 * arg_directory, and the image's read-only flag is merged into
 * arg_read_only. The current working directory serves as the directory in
 * the remaining case. Without a machine name, the host name is used when
 * the directory is "/", otherwise the basename of the image or directory;
 * the result is cleaned with hostname_cleanup() and validated. Ephemeral
 * machines get a random 64-bit hex suffix appended so that several
 * instances can run at once.
 *
 * NOTE(review): "No image for machine '%s': %m" goes through plain
 * log_error(), so %m expands to whatever errno happens to hold. The ": %m"
 * part looks unintended. */
3968 static int determine_names(void) {
3971 if (!arg_image
&& !arg_directory
) {
3973 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3975 r
= image_find(arg_machine
, &i
);
3977 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3979 log_error("No image for machine '%s': %m", arg_machine
);
/* Raw images are used as disk images, everything else as a directory. */
3983 if (i
->type
== IMAGE_RAW
)
3984 r
= set_sanitized_path(&arg_image
, i
->path
);
3986 r
= set_sanitized_path(&arg_directory
, i
->path
);
3988 return log_error_errno(r
, "Invalid image directory: %m");
3991 arg_read_only
= arg_read_only
|| i
->read_only
;
3993 arg_directory
= get_current_dir_name();
3995 if (!arg_directory
&& !arg_machine
) {
3996 log_error("Failed to determine path, please use -D or -i.");
4002 if (arg_directory
&& path_equal(arg_directory
, "/"))
4003 arg_machine
= gethostname_malloc();
4005 arg_machine
= strdup(basename(arg_image
?: arg_directory
));
4010 hostname_cleanup(arg_machine
, false);
4011 if (!machine_name_is_valid(arg_machine
)) {
4012 log_error("Failed to determine machine name automatically, please use -M.");
4016 if (arg_ephemeral
) {
4019 /* Add a random suffix when this is an
4020 * ephemeral machine, so that we can run many
4021 * instances at once without manually having
4022 * to specify -M each time. */
4024 if (asprintf(&b
, "%s-%016" PRIx64
, arg_machine
, random_u64()) < 0)
/* Determines the UID/GID base used for user namespacing.
 *
 * When arg_uid_shift is still UID_INVALID it is derived from the owner of
 * `directory`: the upper 16 bits of st_uid become the shift, the upper 16
 * bits of st_gid must be identical, and arg_uid_range is set to 0x10000.
 * The shift is then checked so that shift + range cannot exceed the largest
 * uid_t value, and the chosen base and range are logged. */
4035 static int determine_uid_shift(const char *directory
) {
4043 if (arg_uid_shift
== UID_INVALID
) {
4046 r
= stat(directory
, &st
);
4048 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
/* Keep only the upper 16 bits: the base of a 64K-sized UID block. */
4050 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
4052 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000))) {
4053 log_error("UID and GID base of %s don't match.", directory
);
4057 arg_uid_range
= UINT32_C(0x10000);
/* Overflow guard, written so that the comparison itself cannot wrap. */
4060 if (arg_uid_shift
> (uid_t
) -1 - arg_uid_range
) {
4061 log_error("UID base too high for UID range.");
4065 log_info("Using user namespaces with base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
/* Body of the innermost child process, the one that becomes the container
 * payload. In order, it:
 *   - synchronizes with the parent through `barrier` around the UID map and
 *     cgroup setup (barrier points #1 to #4);
 *   - performs the in-container mounts (mount_all(NULL, true),
 *     mount_systemd_cgroup_writable()), resets UID/GID, sets up the boot ID
 *     and kmsg, and closes the kmsg and rtnl sockets once used;
 *   - drops capabilities, applies the requested personality (or PER_LINUX32
 *     when `secondary` is set), the SELinux exec context, and the target
 *     user via change_uid_gid();
 *   - builds the environment (PATH, container=, TERM, HOME, USER, LOGNAME,
 *     container_uuid, LISTEN_FDS/LISTEN_PID) and merges arg_setenv into it;
 *   - executes the payload: an init system searched at
 *     /usr/lib/systemd/systemd, /lib/systemd/systemd and /sbin/init, or the
 *     command given after the options, or a login shell (/bin/bash, then
 *     /bin/sh) started from the home directory.
 * It only returns when something failed; the final statement reports the
 * exec failure.
 *
 * NOTE(review): the chdir() result before the shell fallback is ignored. */
4069 static int inner_child(
4071 const char *directory
,
4079 _cleanup_free_
char *home
= NULL
;
/* Fixed entries first; the NULL slots are filled in further down. */
4081 const char *envp
[] = {
4082 "PATH=" DEFAULT_PATH_SPLIT_USR
,
4083 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
4088 NULL
, /* container_uuid */
4089 NULL
, /* LISTEN_FDS */
4090 NULL
, /* LISTEN_PID */
4094 _cleanup_strv_free_
char **env_use
= NULL
;
4099 assert(kmsg_socket
>= 0);
4102 /* Tell the parent, that it now can write the UID map. */
4103 (void) barrier_place(barrier
); /* #1 */
4105 /* Wait until the parent wrote the UID map */
4106 if (!barrier_place_and_sync(barrier
)) { /* #2 */
4107 log_error("Parent died too early");
4112 r
= mount_all(NULL
, true);
4116 /* Wait until we are cgroup-ified, so that we
4117 * can mount the right cgroup path writable */
4118 if (!barrier_place_and_sync(barrier
)) { /* #3 */
4119 log_error("Parent died too early");
4123 r
= mount_systemd_cgroup_writable("");
4127 r
= reset_uid_gid();
4129 return log_error_errno(r
, "Couldn't become new root: %m");
4131 r
= setup_boot_id(NULL
);
4135 r
= setup_kmsg(NULL
, kmsg_socket
);
4138 kmsg_socket
= safe_close(kmsg_socket
);
4143 return log_error_errno(errno
, "setsid() failed: %m");
4145 if (arg_private_network
)
4148 r
= send_rtnl(rtnl_socket
);
4151 rtnl_socket
= safe_close(rtnl_socket
);
4153 if (drop_capabilities() < 0)
4154 return log_error_errno(errno
, "drop_capabilities() failed: %m");
/* An explicit personality wins; otherwise `secondary` selects PER_LINUX32. */
4158 if (arg_personality
!= PERSONALITY_INVALID
) {
4159 if (personality(arg_personality
) < 0)
4160 return log_error_errno(errno
, "personality() failed: %m");
4161 } else if (secondary
) {
4162 if (personality(PER_LINUX32
) < 0)
4163 return log_error_errno(errno
, "personality() failed: %m");
4167 if (arg_selinux_context
)
4168 if (setexeccon((security_context_t
) arg_selinux_context
) < 0)
4169 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
4172 r
= change_uid_gid(&home
);
/* Pass the caller's TERM setting through, if there is one. */
4176 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
/* Each asprintf() writes a freshly allocated string into the next free
 * envp slot; n_env advances with every entry. */
4180 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
4181 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
4182 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0))
4185 if (!sd_id128_equal(arg_uuid
, SD_ID128_NULL
)) {
4188 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", id128_format_as_uuid(arg_uuid
, as_uuid
)) < 0)
/* Passed-in sockets must survive exec, so O_CLOEXEC is cleared on them. */
4192 if (fdset_size(fds
) > 0) {
4193 r
= fdset_cloexec(fds
, false);
4195 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
4197 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
4198 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=1") < 0))
4202 env_use
= strv_env_merge(2, envp
, arg_setenv
);
4206 /* Let the parent know that we are ready and
4207 * wait until the parent is ready with the
4209 if (!barrier_place_and_sync(barrier
)) { /* #4 */
4210 log_error("Parent died too early");
4214 /* Now, explicitly close the log, so that we
4215 * then can close all remaining fds. Closing
4216 * the log explicitly first has the benefit
4217 * that the logging subsystem knows about it,
4218 * and is thus ready to be reopened should we
4219 * need it again. Note that the other fds
4220 * closed here are at least the locking and
4223 (void) fdset_close_others(fds
);
4229 /* Automatically search for the init system */
4231 m
= 1 + argc
- optind
;
4232 a
= newa(char*, m
+ 1);
4233 memcpy(a
+ 1, argv
+ optind
, m
* sizeof(char*));
/* Each execve() only returns on failure, so the next path is a fallback. */
4235 a
[0] = (char*) "/usr/lib/systemd/systemd";
4236 execve(a
[0], a
, env_use
);
4238 a
[0] = (char*) "/lib/systemd/systemd";
4239 execve(a
[0], a
, env_use
);
4241 a
[0] = (char*) "/sbin/init";
4242 execve(a
[0], a
, env_use
);
4243 } else if (argc
> optind
)
4244 execvpe(argv
[optind
], argv
+ optind
, env_use
);
/* argv[0] starting with a dash asks the shell to act as a login shell. */
4246 chdir(home
? home
: "/root");
4247 execle("/bin/bash", "-bash", NULL
, env_use
);
4248 execle("/bin/sh", "-sh", NULL
, env_use
);
4252 return log_error_errno(errno
, "execv() failed: %m");
/* Body of the outer child process, which prepares the container file system
 * in the host's user namespace and then spawns the inner child.
 *
 * Steps, in order: arrange to be killed when the parent dies; replace the
 * standard streams with the container console; mark all mounts as slave;
 * mount the image partitions; determine the UID shift and send it to the
 * parent over `uid_shift_socket`; turn the container directory into a bind
 * mount; run the individual setup helpers (volatile modes, base file
 * system, read-only remount, API mounts, device nodes, pts, propagation,
 * console, seccomp, timezone, resolv.conf, journal, custom mounts, cgroup);
 * move the root; clone the inner child with the requested namespaces; and
 * finally send the inner child's PID to the parent over `pid_socket`.
 * Failures are logged and returned. */
4255 static int outer_child(
4257 const char *directory
,
4258 const char *console
,
4259 const char *root_device
, bool root_device_rw
,
4260 const char *home_device
, bool home_device_rw
,
4261 const char *srv_device
, bool srv_device_rw
,
4267 int uid_shift_socket
,
4279 assert(pid_socket
>= 0);
4280 assert(kmsg_socket
>= 0);
/* Receive SIGKILL if the parent process goes away. */
4282 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
4283 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
/* With fds 0-2 closed, the console opened next is expected to land on
 * STDIN_FILENO; the check right after it tests exactly that. */
4286 close_nointr(STDIN_FILENO
);
4287 close_nointr(STDOUT_FILENO
);
4288 close_nointr(STDERR_FILENO
);
4290 r
= open_terminal(console
, O_RDWR
);
4291 if (r
!= STDIN_FILENO
) {
4297 return log_error_errno(r
, "Failed to open console: %m");
4300 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
4301 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
)
4302 return log_error_errno(errno
, "Failed to duplicate console: %m");
4305 r
= reset_audit_loginuid();
4309 /* Mark everything as slave, so that we still
4310 * receive mounts from the real root, but don't
4311 * propagate mounts to the real root. */
4312 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
4313 return log_error_errno(errno
, "MS_SLAVE|MS_REC failed: %m");
4315 r
= mount_devices(directory
,
4316 root_device
, root_device_rw
,
4317 home_device
, home_device_rw
,
4318 srv_device
, srv_device_rw
);
4322 r
= determine_uid_shift(directory
);
/* Report the chosen UID shift to the parent; a short write is an error. */
4327 l
= send(uid_shift_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
4329 return log_error_errno(errno
, "Failed to send UID shift: %m");
4330 if (l
!= sizeof(arg_uid_shift
)) {
4331 log_error("Short write while sending UID shift.");
4336 /* Turn directory into bind mount */
4337 if (mount(directory
, directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
4338 return log_error_errno(errno
, "Failed to make bind mount: %m");
4340 r
= setup_volatile(directory
);
4344 r
= setup_volatile_state(directory
);
4348 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
4352 if (arg_read_only
) {
4353 r
= bind_remount_recursive(directory
, true);
4355 return log_error_errno(r
, "Failed to make tree read-only: %m");
4358 r
= mount_all(directory
, false);
4362 if (copy_devnodes(directory
) < 0)
4365 dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
4367 if (setup_pts(directory
) < 0)
4370 r
= setup_propagate(directory
);
4374 r
= setup_dev_console(directory
, console
);
4378 r
= setup_seccomp();
4382 r
= setup_timezone(directory
);
4386 r
= setup_resolv_conf(directory
);
4390 r
= setup_journal(directory
);
4394 r
= mount_custom(directory
);
4398 r
= mount_cgroup(directory
);
4402 r
= mount_move_root(directory
);
4404 return log_error_errno(r
, "Failed to move root directory: %m");
/* A new mount namespace is always requested; IPC/PID/UTS, network and user
 * namespaces depend on the corresponding arg_* settings. */
4406 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4407 (arg_share_system
? 0 : CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
) |
4408 (arg_private_network
? CLONE_NEWNET
: 0) |
4409 (arg_userns
? CLONE_NEWUSER
: 0),
4412 return log_error_errno(errno
, "Failed to fork inner child: %m");
4415 pid_socket
= safe_close(pid_socket
);
4416 uid_shift_socket
= safe_close(uid_shift_socket
);
4418 /* The inner child has all namespaces that are
4419 * requested, so that we all are owned by the user if
4420 * user namespaces are turned on. */
4422 r
= inner_child(barrier
, directory
, secondary
, kmsg_socket
, rtnl_socket
, fds
, argc
, argv
);
4424 _exit(EXIT_FAILURE
);
4426 _exit(EXIT_SUCCESS
);
/* Tell the parent which PID the inner child got. */
4429 l
= send(pid_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4431 return log_error_errno(errno
, "Failed to send PID: %m");
4432 if (l
!= sizeof(pid
)) {
4433 log_error("Short write while sending PID.");
4437 pid_socket
= safe_close(pid_socket
);
/* Writes the user namespace ID mappings of process `pid`. One line of the
 * form "0 <arg_uid_shift> <arg_uid_range>\n" is written first to
 * /proc/<pid>/uid_map and then, unchanged, to /proc/<pid>/gid_map, so UIDs
 * and GIDs share the same range. A write failure is logged and returned. */
4442 static int setup_uid_map(pid_t pid
) {
/* Buffers sized for the longest path and for three decimal IDs plus
 * separators, newline and terminating NUL. */
4443 char uid_map
[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1], line
[DECIMAL_STR_MAX(uid_t
)*3+3+1];
4448 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4449 xsprintf(line
, UID_FMT
" " UID_FMT
" " UID_FMT
"\n", 0, arg_uid_shift
, arg_uid_range
);
4450 r
= write_string_file(uid_map
, line
, 0);
4452 return log_error_errno(r
, "Failed to write UID map: %m");
4454 /* We always assign the same UID and GID ranges */
/* The uid_map buffer is reused for the gid_map path (same length). */
4455 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4456 r
= write_string_file(uid_map
, line
, 0);
4458 return log_error_errno(r
, "Failed to write GID map: %m");
/* Hands ownership of the cgroup of process `pid` to arg_uid_shift.
 *
 * Looks up the cgroup path of `pid` with cg_pid_get_path(), converts it to a
 * file system path with cg_get_path() for SYSTEMD_CGROUP_CONTROLLER, opens
 * that directory, and then calls fchownat() on the directory itself and on a
 * fixed list of control files, using arg_uid_shift as both owner and group.
 *
 * Failures of the lookups or of open() are logged with log_error_errno() and
 * returned; a failing fchownat() is only logged as a warning.
 *
 * NOTE(review): the guarding conditions, the declarations of `r` and `fn`,
 * and the final return statement are not visible in this excerpt. */
4463 static int chown_cgroup(pid_t pid
) {
/* `path`, `fs` and `fd` carry the _cleanup_free_ / _cleanup_close_
 * attributes; presumably these release them automatically on scope exit -
 * the macros are defined elsewhere. */
4464 _cleanup_free_
char *path
= NULL
, *fs
= NULL
;
4465 _cleanup_close_
int fd
= -1;
4469 r
= cg_pid_get_path(NULL
, pid
, &path
);
4471 return log_error_errno(r
, "Failed to get container cgroup path: %m");
4473 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, path
, NULL
, &fs
);
4475 return log_error_errno(r
, "Failed to get file system path for container cgroup: %m");
4477 fd
= open(fs
, O_RDONLY
|O_CLOEXEC
|O_DIRECTORY
);
4479 return log_error_errno(errno
, "Failed to open %s: %m", fs
);
/* Best effort: every entry is tried, and a failure on one entry does not
 * stop the loop. "." refers to the cgroup directory itself. */
4481 FOREACH_STRING(fn
, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
4482 if (fchownat(fd
, fn
, arg_uid_shift
, arg_uid_shift
, 0) < 0)
4483 log_warning_errno(errno
, "Failed to chown() cgroup file %s, ignoring: %m", fn
);
/* Program entry point.
 *
 * Steps visible in this excerpt, in order:
 *  - parse the command line (parse_argv), determine names, and require an
 *    effective UID of 0;
 *  - collect file descriptors passed in via sd_listen_fds();
 *  - prepare the root: either a directory (optionally a fresh snapshot made
 *    with btrfs_subvol_snapshot() for the ephemeral or template case, each
 *    protected by image_path_lock()), or a disk image that is locked, set up
 *    with setup_image() and dissected with dissect_image(), using a temporary
 *    directory created with mkdtemp() as arg_directory;
 *  - allocate and unlock a pseudo tty;
 *  - create the barrier and the socket pairs, then fork the outer child with
 *    raw_clone(SIGCHLD|CLONE_NEWNS), which runs outer_child();
 *  - in the parent: wait for the outer child, receive the inner child's PID
 *    and the UID shift over the socket pairs, set up the UID map, network
 *    interfaces, machine registration and cgroup ownership, and synchronize
 *    with the child through the barrier;
 *  - run an sd_event loop with a PTY forwarder until the container exits,
 *    then wait for the container and evaluate its status;
 *  - clean up (loop device, snapshot, propagate directory, argument lists).
 *
 * Returns EXIT_FAILURE if `r` is negative at the end, otherwise `ret`.
 *
 * NOTE(review): many conditionals, labels, jumps and closing braces are not
 * visible in this excerpt, so the exact branch and loop structure cannot be
 * confirmed from here. */
4488 int main(int argc
, char *argv
[]) {
4490 _cleanup_free_
char *device_path
= NULL
, *root_device
= NULL
, *home_device
= NULL
, *srv_device
= NULL
, *console
= NULL
;
4491 bool root_device_rw
= true, home_device_rw
= true, srv_device_rw
= true;
4492 _cleanup_close_
int master
= -1, image_fd
= -1;
4493 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
4494 int r
, n_fd_passed
, loop_nr
= -1;
4495 char veth_name
[IFNAMSIZ
];
4496 bool secondary
= false, remove_subvol
= false;
4499 int ret
= EXIT_SUCCESS
;
4500 union in_addr_union exposed
= {};
4501 _cleanup_release_lock_file_ LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
4504 log_parse_environment();
4507 r
= parse_argv(argc
, argv
);
4511 r
= determine_names();
4515 if (geteuid() != 0) {
4516 log_error("Need to be root.");
4521 n_fd_passed
= sd_listen_fds(false);
4522 if (n_fd_passed
> 0) {
4523 r
= fdset_new_listen_fds(&fds
, false);
4525 log_error_errno(r
, "Failed to collect file descriptors: %m");
4530 if (arg_directory
) {
4533 if (path_equal(arg_directory
, "/") && !arg_ephemeral
) {
4534 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
4539 if (arg_ephemeral
) {
4540 _cleanup_free_
char *np
= NULL
;
4542 /* If the specified path is a mount point we
4543 * generate the new snapshot immediately
4544 * inside it under a random name. However if
4545 * the specified path is not a mount point we
4546 * create the new snapshot in the parent
4547 * directory, just next to it. */
4548 r
= path_is_mount_point(arg_directory
, 0);
4550 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
4554 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
4556 r
= tempfn_random(arg_directory
, "machine.", &np
);
4558 log_error_errno(r
, "Failed to generate name for snapshot: %m");
/* Shared lock for read-only use, exclusive otherwise; LOCK_NB is always
 * added. */
4562 r
= image_path_lock(np
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4564 log_error_errno(r
, "Failed to lock %s: %m", np
);
4568 r
= btrfs_subvol_snapshot(arg_directory
, np
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4570 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
4574 free(arg_directory
);
/* remove_subvol is checked again in the cleanup part, where the subvolume
 * at arg_directory is removed with btrfs_subvol_remove(). */
4578 remove_subvol
= true;
4581 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4583 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
4587 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
4592 r
= btrfs_subvol_snapshot(arg_template
, arg_directory
, (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) | BTRFS_SNAPSHOT_FALLBACK_COPY
| BTRFS_SNAPSHOT_RECURSIVE
);
4595 log_info("Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
4597 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
4601 log_info("Populated %s from template %s.", arg_directory
, arg_template
);
4607 if (path_is_os_tree(arg_directory
) <= 0) {
4608 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory
);
/* Build the path to probe below arg_directory: the program given on the
 * command line if it is an absolute path, otherwise "/usr/bin/". */
4615 p
= strjoina(arg_directory
,
4616 argc
> optind
&& path_is_absolute(argv
[optind
]) ? argv
[optind
] : "/usr/bin/");
4617 if (access(p
, F_OK
) < 0) {
4618 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory
);
4625 char template[] = "/tmp/nspawn-root-XXXXXX";
4628 assert(!arg_template
);
4630 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
4632 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
4636 r
= log_error_errno(r
, "Failed to create image lock: %m");
4640 if (!mkdtemp(template)) {
4641 log_error_errno(errno
, "Failed to create temporary directory: %m");
4646 arg_directory
= strdup(template);
4647 if (!arg_directory
) {
4652 image_fd
= setup_image(&device_path
, &loop_nr
);
4658 r
= dissect_image(image_fd
,
4659 &root_device
, &root_device_rw
,
4660 &home_device
, &home_device_rw
,
4661 &srv_device
, &srv_device_rw
,
4667 r
= custom_mounts_prepare();
4672 isatty(STDIN_FILENO
) > 0 &&
4673 isatty(STDOUT_FILENO
) > 0;
4675 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
4677 r
= log_error_errno(errno
, "Failed to acquire pseudo tty: %m");
4681 r
= ptsname_malloc(master
, &console
);
4683 r
= log_error_errno(r
, "Failed to determine tty name: %m");
4687 if (unlockpt(master
) < 0) {
4688 r
= log_error_errno(errno
, "Failed to unlock tty: %m");
4693 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
4694 arg_machine
, arg_image
?: arg_directory
);
4696 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1) >= 0);
4698 assert_se(sigemptyset(&mask_chld
) == 0);
4699 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
4701 if (prctl(PR_SET_CHILD_SUBREAPER
, 1) < 0) {
4702 r
= log_error_errno(errno
, "Failed to become subreaper: %m");
4707 _cleanup_close_pair_
int kmsg_socket_pair
[2] = { -1, -1 }, rtnl_socket_pair
[2] = { -1, -1 }, pid_socket_pair
[2] = { -1, -1 },
4708 uid_shift_socket_pair
[2] = { -1, -1 };
4709 ContainerStatus container_status
;
4710 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
4711 static const struct sigaction sa
= {
4712 .sa_handler
= nop_handler
,
4713 .sa_flags
= SA_NOCLDSTOP
,
4717 _cleanup_event_unref_ sd_event
*event
= NULL
;
4718 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4719 _cleanup_netlink_unref_ sd_netlink
*rtnl
= NULL
;
4722 r
= barrier_create(&barrier
);
4724 log_error_errno(r
, "Cannot initialize IPC barrier: %m");
/* Four datagram socket pairs connect parent and children. In the code
 * below the child closes index 0 of each pair and the parent closes
 * index 1 of the kmsg, rtnl and pid pairs. */
4728 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
4729 r
= log_error_errno(errno
, "Failed to create kmsg socket pair: %m");
4733 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, rtnl_socket_pair
) < 0) {
4734 r
= log_error_errno(errno
, "Failed to create rtnl socket pair: %m");
4738 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, pid_socket_pair
) < 0) {
4739 r
= log_error_errno(errno
, "Failed to create pid socket pair: %m");
4744 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
, 0, uid_shift_socket_pair
) < 0) {
4745 r
= log_error_errno(errno
, "Failed to create uid shift socket pair: %m");
4749 /* Child can be killed before execv(), so handle SIGCHLD
4750 * in order to interrupt parent's blocking calls and
4751 * give it a chance to call wait() and terminate. */
4752 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4754 r
= log_error_errno(errno
, "Failed to change the signal mask: %m");
4758 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4760 r
= log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
4764 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
, NULL
);
4766 if (errno
== EINVAL
)
4767 r
= log_error_errno(errno
, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
4769 r
= log_error_errno(errno
, "clone() failed: %m");
4775 /* The outer child only has a file system namespace. */
4776 barrier_set_role(&barrier
, BARRIER_CHILD
);
4778 master
= safe_close(master
);
4780 kmsg_socket_pair
[0] = safe_close(kmsg_socket_pair
[0]);
4781 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4782 pid_socket_pair
[0] = safe_close(pid_socket_pair
[0]);
4783 uid_shift_socket_pair
[0] = safe_close(uid_shift_socket_pair
[0]);
4785 (void) reset_all_signal_handlers();
4786 (void) reset_signal_mask();
4788 r
= outer_child(&barrier
,
4791 root_device
, root_device_rw
,
4792 home_device
, home_device_rw
,
4793 srv_device
, srv_device_rw
,
4797 kmsg_socket_pair
[1],
4798 rtnl_socket_pair
[1],
4799 uid_shift_socket_pair
[1],
4803 _exit(EXIT_FAILURE
);
4805 _exit(EXIT_SUCCESS
);
4808 barrier_set_role(&barrier
, BARRIER_PARENT
);
4813 kmsg_socket_pair
[1] = safe_close(kmsg_socket_pair
[1]);
4814 rtnl_socket_pair
[1] = safe_close(rtnl_socket_pair
[1]);
4815 pid_socket_pair
[1] = safe_close(pid_socket_pair
[1]);
4817 /* Wait for the outer child. */
4818 r
= wait_for_terminate_and_warn("namespace helper", pid
, NULL
);
4827 /* And now retrieve the PID of the inner child. */
4828 l
= recv(pid_socket_pair
[0], &pid
, sizeof(pid
), 0);
4830 r
= log_error_errno(errno
, "Failed to read inner child PID: %m");
4833 if (l
!= sizeof(pid
)) {
/* NOTE(review): this message ends in ": %m", but a short read does not set
 * errno, so the expanded text is not meaningful here. The matching
 * "Short write while sending PID." message omits it - consider dropping. */
4834 log_error("Short read while reading inner child PID: %m");
4839 log_debug("Init process invoked as PID " PID_FMT
, pid
);
4842 if (!barrier_place_and_sync(&barrier
)) { /* #1 */
4843 log_error("Child died too early.");
4848 l
= recv(uid_shift_socket_pair
[0], &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
4850 r
= log_error_errno(errno
, "Failed to read UID shift: %m");
4853 if (l
!= sizeof(arg_uid_shift
)) {
/* NOTE(review): same as for the PID above - ": %m" on a short read does not
 * refer to a meaningful errno. */
4854 log_error("Short read while reading UID shift: %m");
4859 r
= setup_uid_map(pid
);
4863 (void) barrier_place(&barrier
); /* #2 */
4866 r
= move_network_interfaces(pid
);
4870 r
= setup_veth(pid
, veth_name
, &ifi
);
4874 r
= setup_bridge(veth_name
, &ifi
);
4878 r
= setup_macvlan(pid
);
4882 r
= setup_ipvlan(pid
);
4886 r
= register_machine(pid
, ifi
);
4890 r
= chown_cgroup(pid
);
4894 /* Notify the child that the parent is ready with all
4895 * its setup (including cgroup-ification), and that
4896 * the child can now hand over control to the code to
4897 * run inside the container. */
4898 (void) barrier_place(&barrier
); /* #3 */
4900 /* Block SIGCHLD here, before notifying child.
4901 * process_pty() will handle it with the other signals. */
4902 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
4904 /* Reset signal to default */
4905 r
= default_signals(SIGCHLD
, -1);
4907 log_error_errno(r
, "Failed to reset SIGCHLD: %m");
4911 /* Let the child know that we are ready and wait until the child is completely ready now. */
4912 if (!barrier_place_and_sync(&barrier
)) { /* #5 */
/* NOTE(review): the earlier barrier failure message reads "Child died too
 * early."; "Client" here may be a typo - confirm before changing. */
4913 log_error("Client died too early.");
4920 "STATUS=Container running.\n"
4921 "X_NSPAWN_LEADER_PID=" PID_FMT
, pid
);
4923 r
= sd_event_new(&event
);
4925 log_error_errno(r
, "Failed to get default event source: %m");
/* NOTE(review): the return values of the sd_event_add_signal() calls below
 * are not checked. */
4929 if (arg_kill_signal
> 0) {
4930 /* Try to kill the init system on SIGINT or SIGTERM */
4931 sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4932 sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, UINT32_TO_PTR(pid
));
4934 /* Immediately exit */
4935 sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
4936 sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
4939 /* simply exit on sigchld */
4940 sd_event_add_signal(event
, NULL
, SIGCHLD
, NULL
, NULL
);
4942 if (arg_expose_ports
) {
4943 r
= watch_rtnl(event
, rtnl_socket_pair
[0], &exposed
, &rtnl
);
4947 (void) expose_ports(rtnl
, &exposed
);
4950 rtnl_socket_pair
[0] = safe_close(rtnl_socket_pair
[0]);
4952 r
= pty_forward_new(event
, master
, true, !interactive
, &forward
);
4954 log_error_errno(r
, "Failed to create PTY forwarder: %m");
4958 r
= sd_event_loop(event
);
4960 log_error_errno(r
, "Failed to run event loop: %m");
4964 pty_forward_get_last_char(forward
, &last_char
);
4966 forward
= pty_forward_free(forward
);
4968 if (!arg_quiet
&& last_char
!= '\n')
4971 /* Kill if it is not dead yet anyway */
4972 terminate_machine(pid
);
4974 /* Normally redundant, but better safe than sorry */
4977 r
= wait_for_container(pid
, &container_status
);
4981 /* We failed to wait for the container, or the
4982 * container exited abnormally */
4984 else if (r
> 0 || container_status
== CONTAINER_TERMINATED
){
4985 /* The container exited with a non-zero
4986 * status, or with zero status and no reboot
4992 /* CONTAINER_REBOOTED, loop again */
4994 if (arg_keep_unit
) {
4995 /* Special handling if we are running as a
4996 * service: instead of simply restarting the
4997 * machine we want to restart the entire
4998 * service, so let's inform systemd about this
4999 * with the special exit code 133. The service
5000 * file uses RestartForceExitStatus=133 so
5001 * that this results in a full nspawn
5002 * restart. This is necessary since we might
5003 * have cgroup parameters set we want to have
5010 flush_ports(&exposed
);
5016 "STATUS=Terminating...");
5021 /* Try to flush whatever is still queued in the pty */
5023 (void) copy_bytes(master
, STDOUT_FILENO
, (off_t
) -1, false);
5025 loop_remove(loop_nr
, &image_fd
);
5027 if (remove_subvol
&& arg_directory
) {
5030 k
= btrfs_subvol_remove(arg_directory
, true);
5032 log_warning_errno(k
, "Cannot remove subvolume '%s', ignoring: %m", arg_directory
);
5038 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
5039 (void) rm_rf(p
, REMOVE_ROOT
);
5042 free(arg_directory
);
5047 strv_free(arg_setenv
);
5048 strv_free(arg_network_interfaces
);
5049 strv_free(arg_network_macvlan
);
5050 strv_free(arg_network_ipvlan
);
5051 custom_mount_free_all();
5053 flush_ports(&exposed
);
/* Unlink every remaining ExposePort entry from the arg_expose_ports list. */
5055 while (arg_expose_ports
) {
5056 ExposePort
*p
= arg_expose_ports
;
5057 LIST_REMOVE(ports
, arg_expose_ports
, p
);
/* A negative `r` always yields EXIT_FAILURE; otherwise `ret` is returned. */
5061 return r
< 0 ? EXIT_FAILURE
: ret
;