1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <linux/loop.h>
10 #include <selinux/selinux.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
23 #include "sd-daemon.h"
26 #include "alloc-util.h"
27 #include "ether-addr-util.h"
29 #include "base-filesystem.h"
30 #include "blkid-util.h"
31 #include "btrfs-util.h"
33 #include "bus-error.h"
34 #include "bus-locator.h"
37 #include "capability-util.h"
38 #include "cgroup-util.h"
40 #include "common-signal.h"
42 #include "cpu-set-util.h"
43 #include "creds-util.h"
44 #include "dev-setup.h"
45 #include "discover-image.h"
46 #include "dissect-image.h"
52 #include "format-util.h"
55 #include "hexdecoct.h"
56 #include "hostname-setup.h"
57 #include "hostname-util.h"
58 #include "id128-util.h"
61 #include "loop-util.h"
62 #include "loopback-setup.h"
64 #include "main-func.h"
65 #include "missing_sched.h"
67 #include "mount-util.h"
68 #include "mountpoint-util.h"
69 #include "namespace-util.h"
70 #include "netlink-util.h"
71 #include "nspawn-bind-user.h"
72 #include "nspawn-cgroup.h"
73 #include "nspawn-creds.h"
74 #include "nspawn-def.h"
75 #include "nspawn-expose-ports.h"
76 #include "nspawn-mount.h"
77 #include "nspawn-network.h"
78 #include "nspawn-oci.h"
79 #include "nspawn-patch-uid.h"
80 #include "nspawn-register.h"
81 #include "nspawn-seccomp.h"
82 #include "nspawn-settings.h"
83 #include "nspawn-setuid.h"
84 #include "nspawn-stub-pid1.h"
85 #include "nspawn-util.h"
87 #include "nulstr-util.h"
90 #include "parse-argument.h"
91 #include "parse-util.h"
92 #include "pretty-print.h"
93 #include "process-util.h"
95 #include "random-util.h"
96 #include "raw-clone.h"
97 #include "resolve-util.h"
98 #include "rlimit-util.h"
100 #include "seccomp-util.h"
101 #include "selinux-util.h"
102 #include "signal-util.h"
103 #include "socket-util.h"
104 #include "stat-util.h"
105 #include "stdio-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
109 #include "sysctl-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-name.h"
114 #include "user-util.h"
116 /* Path of the notify socket inside the container, which the container can use to talk to nspawn using the sd_notify(3) protocol */
117 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
/* NOTE(review): presumably the in-container directory used to hand additional mounts to the container;
 * its users are not part of this excerpt — confirm. */
118 #define NSPAWN_MOUNT_TUNNEL "/run/host/incoming"
/* NOTE(review): presumably a process exit code with a special "restart" meaning; its users are not part
 * of this excerpt — confirm. */
120 #define EXIT_FORCE_RESTART 133
/* Beginning of the ContainerStatus enumeration. Only its first enumerator, CONTAINER_TERMINATED, is
 * visible in this excerpt; the remaining enumerators and the end of the typedef are not shown. */
122 typedef enum ContainerStatus
{
123 CONTAINER_TERMINATED
,
/* File-scope settings, each initialised to its built-in default. The visible parts of parse_argv() and
 * parse_environment() overwrite them and record what was set explicitly in arg_settings_mask. */

/* Path-valued settings; all start out unset (NULL). */
127 static char *arg_directory
= NULL
;
128 static char *arg_template
= NULL
;
129 static char *arg_chdir
= NULL
;
130 static char *arg_pivot_root_new
= NULL
;
131 static char *arg_pivot_root_old
= NULL
;
/* User name as given on the command line, plus numeric IDs that start out invalid. */
132 static char *arg_user
= NULL
;
133 static uid_t arg_uid
= UID_INVALID
;
134 static gid_t arg_gid
= GID_INVALID
;
/* Array and element count belong together. */
135 static gid_t
* arg_supplementary_gids
= NULL
;
136 static size_t arg_n_supplementary_gids
= 0;
/* Zero-initialised; the visible parse_argv() code rejects an all-zero UUID given by the user. */
137 static sd_id128_t arg_uuid
= {};
138 static char *arg_machine
= NULL
; /* The name used by the host to refer to this */
139 static char *arg_hostname
= NULL
; /* The name the payload sees by default */
/* These two are const and have no registered destructor: the visible parse_argv() code stores optarg in
 * them directly instead of copying it. */
140 static const char *arg_selinux_context
= NULL
;
141 static const char *arg_selinux_apifs_context
= NULL
;
142 static char *arg_slice
= NULL
;
143 static bool arg_private_network
= false;
144 static bool arg_read_only
= false;
/* Defaults to START_PID1; the visible parse_argv() code switches it to START_BOOT or START_PID2 and
 * refuses to combine the two. */
145 static StartMode arg_start_mode
= START_PID1
;
146 static bool arg_ephemeral
= false;
/* Journal linking mode and its "try" modifier are parsed together by parse_link_journal(). */
147 static LinkJournal arg_link_journal
= LINK_AUTO
;
148 static bool arg_link_journal_try
= false;
/* Default capability mask: one bit per capability, bit position being the CAP_* constant. The help text
 * describes --capability= as retaining capabilities in addition to this default and --drop-capability=
 * as dropping capabilities from it.
 *
 * NOTE(review): the inner line numbering of this excerpt jumps from 157 to 159 inside the initializer,
 * so at least one term of the original expression is not visible here. */
149 static uint64_t arg_caps_retain
=
150 (1ULL << CAP_AUDIT_CONTROL
) |
151 (1ULL << CAP_AUDIT_WRITE
) |
152 (1ULL << CAP_CHOWN
) |
153 (1ULL << CAP_DAC_OVERRIDE
) |
154 (1ULL << CAP_DAC_READ_SEARCH
) |
155 (1ULL << CAP_FOWNER
) |
156 (1ULL << CAP_FSETID
) |
157 (1ULL << CAP_IPC_OWNER
) |
159 (1ULL << CAP_LEASE
) |
160 (1ULL << CAP_LINUX_IMMUTABLE
) |
161 (1ULL << CAP_MKNOD
) |
162 (1ULL << CAP_NET_BIND_SERVICE
) |
163 (1ULL << CAP_NET_BROADCAST
) |
164 (1ULL << CAP_NET_RAW
) |
165 (1ULL << CAP_SETFCAP
) |
166 (1ULL << CAP_SETGID
) |
167 (1ULL << CAP_SETPCAP
) |
168 (1ULL << CAP_SETUID
) |
169 (1ULL << CAP_SYS_ADMIN
) |
170 (1ULL << CAP_SYS_BOOT
) |
171 (1ULL << CAP_SYS_CHROOT
) |
172 (1ULL << CAP_SYS_NICE
) |
173 (1ULL << CAP_SYS_PTRACE
) |
174 (1ULL << CAP_SYS_RESOURCE
) |
175 (1ULL << CAP_SYS_TTY_CONFIG
);
/* Ambient capability mask; empty by default, the visible parse_argv() code ORs parsed masks into it. */
176 static uint64_t arg_caps_ambient
= 0;
177 static CapabilityQuintet arg_full_capabilities
= CAPABILITY_QUINTET_NULL
;
/* Further file-scope settings with their built-in defaults. Arrays are kept as a pointer plus a separate
 * element count (arg_n_*). */

/* Custom mounts; iterated by custom_mount_check_all() and appended to by bind_mount_parse(). */
178 static CustomMount
*arg_custom_mounts
= NULL
;
179 static size_t arg_n_custom_mounts
= 0;
180 static char **arg_setenv
= NULL
;
181 static bool arg_quiet
= false;
/* Registration is on by default. */
182 static bool arg_register
= true;
183 static bool arg_keep_unit
= false;
/* Network settings. The visible parse_argv() code sets arg_private_network whenever one of the
 * interface/veth/bridge/zone options is used. */
184 static char **arg_network_interfaces
= NULL
;
185 static char **arg_network_macvlan
= NULL
;
186 static char **arg_network_ipvlan
= NULL
;
187 static bool arg_network_veth
= false;
188 static char **arg_network_veth_extra
= NULL
;
189 static char *arg_network_bridge
= NULL
;
190 static char *arg_network_zone
= NULL
;
191 static char *arg_network_namespace_path
= NULL
;
/* Unlike its neighbours this one is not declared static. parse_environment() fills it from
 * $SYSTEMD_NSPAWN_NETWORK_MAC. */
192 struct ether_addr arg_network_provided_mac
= {};
193 static PagerFlags arg_pager_flags
= 0;
194 static unsigned long arg_personality
= PERSONALITY_INVALID
;
195 static char *arg_image
= NULL
;
196 static char *arg_oci_bundle
= NULL
;
197 static VolatileMode arg_volatile_mode
= VOLATILE_NO
;
198 static ExposePort
*arg_expose_ports
= NULL
;
199 static char **arg_property
= NULL
;
200 static sd_bus_message
*arg_property_message
= NULL
;
/* User namespacing is off by default; the UID shift starts out invalid and the range defaults to
 * 0x10000 (65536). */
201 static UserNamespaceMode arg_userns_mode
= USER_NAMESPACE_NO
;
202 static uid_t arg_uid_shift
= UID_INVALID
, arg_uid_range
= 0x10000U
;
203 static UserNamespaceOwnership arg_userns_ownership
= _USER_NAMESPACE_OWNERSHIP_INVALID
;
204 static int arg_kill_signal
= 0;
/* Unknown until one of the detect_unified_cgroup_hierarchy_*() functions picks a value. */
205 static CGroupUnified arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_UNKNOWN
;
/* Bit mask of SETTING_* flags; the visible parsing code ORs a flag in whenever it sets the matching
 * setting. */
206 static SettingsMask arg_settings_mask
= 0;
207 static int arg_settings_trusted
= -1;
208 static char **arg_parameters
= NULL
;
/* Defaults to "systemd-nspawn"; parse_environment() assigns the value of
 * $SYSTEMD_NSPAWN_CONTAINER_SERVICE to it. */
209 static const char *arg_container_service_name
= "systemd-nspawn";
210 static bool arg_notify_ready
= false;
/* On by default; parse_environment() turns it off when cg_ns_supported() reports no support and lets
 * $SYSTEMD_NSPAWN_USE_CGNS override it. */
211 static bool arg_use_cgns
= true;
/* Namespaces unshared by default; parse_share_ns_env() clears or sets individual bits. */
212 static unsigned long arg_clone_ns_flags
= CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
;
/* Default mount behaviour; adjusted by parse_mount_settings_env(). */
213 static MountSettingsMask arg_mount_settings
= MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_TMPFS_TMP
;
214 static VeritySettings arg_verity_settings
= VERITY_SETTINGS_DEFAULT
;
215 static char **arg_syscall_allow_list
= NULL
;
216 static char **arg_syscall_deny_list
= NULL
;
/* NOTE(review): the source lines directly before and after this declaration are missing from this
 * excerpt (inner numbering skips 217 and 219) — possibly preprocessor conditionals; confirm. */
218 static scmp_filter_ctx arg_seccomp
= NULL
;
/* One slot per resource limit, all initially NULL. */
220 static struct rlimit
*arg_rlimit
[_RLIMIT_MAX
] = {};
221 static bool arg_no_new_privileges
= false;
/* The value is paired with a separate "was it set" flag. */
222 static int arg_oom_score_adjust
= 0;
223 static bool arg_oom_score_adjust_set
= false;
224 static CPUSet arg_cpu_set
= {};
225 static ResolvConfMode arg_resolv_conf
= RESOLV_CONF_AUTO
;
226 static TimezoneMode arg_timezone
= TIMEZONE_AUTO
;
/* UINT_MAX is the initial value for both console dimensions. */
227 static unsigned arg_console_width
= UINT_MAX
, arg_console_height
= UINT_MAX
;
228 static DeviceNode
* arg_extra_nodes
= NULL
;
229 static size_t arg_n_extra_nodes
= 0;
230 static char **arg_sysctl
= NULL
;
/* Invalid until handle_arg_console() stores a mode. */
231 static ConsoleMode arg_console_mode
= _CONSOLE_MODE_INVALID
;
232 static Credential
*arg_credentials
= NULL
;
233 static size_t arg_n_credentials
= 0;
234 static char **arg_bind_user
= NULL
;
/* parse_environment() assigns the parsed value of $SYSTEMD_SUPPRESS_SYNC to this. */
235 static bool arg_suppress_sync
= false;
236 static char *arg_settings_filename
= NULL
;
237 static Architecture arg_architecture
= _ARCHITECTURE_INVALID
;
238 static ImagePolicy
*arg_image_policy
= NULL
;
/* Pairs each resource-owning global with a cleanup function through STATIC_DESTRUCTOR_REGISTER():
 * freep for single allocations, strv_freep for string arrays, and type-specific functions for the
 * rest. The macro's definition (and hence when the cleanup runs) is not part of this excerpt.
 *
 * NOTE(review): arg_selinux_context and arg_selinux_apifs_context are not listed, which matches the
 * visible code storing optarg in them without copying. Several other pointer globals declared in this
 * file (e.g. arg_custom_mounts, arg_expose_ports, arg_credentials) are not listed here either —
 * presumably released elsewhere; confirm. */
240 STATIC_DESTRUCTOR_REGISTER(arg_directory
, freep
);
241 STATIC_DESTRUCTOR_REGISTER(arg_template
, freep
);
242 STATIC_DESTRUCTOR_REGISTER(arg_chdir
, freep
);
243 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new
, freep
);
244 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old
, freep
);
245 STATIC_DESTRUCTOR_REGISTER(arg_user
, freep
);
246 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids
, freep
);
247 STATIC_DESTRUCTOR_REGISTER(arg_machine
, freep
);
248 STATIC_DESTRUCTOR_REGISTER(arg_hostname
, freep
);
249 STATIC_DESTRUCTOR_REGISTER(arg_slice
, freep
);
250 STATIC_DESTRUCTOR_REGISTER(arg_setenv
, strv_freep
);
251 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces
, strv_freep
);
252 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan
, strv_freep
);
253 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan
, strv_freep
);
254 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra
, strv_freep
);
255 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge
, freep
);
256 STATIC_DESTRUCTOR_REGISTER(arg_network_zone
, freep
);
257 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path
, freep
);
258 STATIC_DESTRUCTOR_REGISTER(arg_image
, freep
);
259 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle
, freep
);
260 STATIC_DESTRUCTOR_REGISTER(arg_property
, strv_freep
);
261 STATIC_DESTRUCTOR_REGISTER(arg_property_message
, sd_bus_message_unrefp
);
262 STATIC_DESTRUCTOR_REGISTER(arg_parameters
, strv_freep
);
263 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings
, verity_settings_done
);
264 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list
, strv_freep
);
265 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list
, strv_freep
);
267 STATIC_DESTRUCTOR_REGISTER(arg_seccomp
, seccomp_releasep
);
269 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set
, cpu_set_reset
);
270 STATIC_DESTRUCTOR_REGISTER(arg_sysctl
, strv_freep
);
271 STATIC_DESTRUCTOR_REGISTER(arg_bind_user
, strv_freep
);
272 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename
, freep
);
273 STATIC_DESTRUCTOR_REGISTER(arg_image_policy
, image_policy_freep
);
/* Parses the console mode name in 'arg' and stores the result in arg_console_mode.
 *
 * Values compared against in the visible code: "help", "interactive", "read-only", "passive", "pipe"
 * and "autopipe". For "pipe" a message is logged (at LOG_DEBUG if arg_quiet is set, LOG_NOTICE
 * otherwise) when both stdin and stdout are TTYs, since "interactive" is most likely what the user
 * wants then. "autopipe" uses the same isatty() test and assigns CONSOLE_INTERACTIVE or CONSOLE_PIPE.
 * An unrecognized value is reported with EINVAL. For a recognized mode SETTING_CONSOLE_MODE is added
 * to arg_settings_mask.
 *
 * NOTE(review): this excerpt omits several source lines of the function (the body of the "help"
 * branch, the "else" keywords of the last branches and the closing return), so the exact return
 * values on success are not visible here. */
275 static int handle_arg_console(const char *arg
) {
276 if (streq(arg
, "help")) {
285 if (streq(arg
, "interactive"))
286 arg_console_mode
= CONSOLE_INTERACTIVE
;
287 else if (streq(arg
, "read-only"))
288 arg_console_mode
= CONSOLE_READ_ONLY
;
289 else if (streq(arg
, "passive"))
290 arg_console_mode
= CONSOLE_PASSIVE
;
291 else if (streq(arg
, "pipe")) {
/* Both ends being a TTY is unusual for pipe mode: tell the user, but carry on. */
292 if (isatty(STDIN_FILENO
) > 0 && isatty(STDOUT_FILENO
) > 0)
293 log_full(arg_quiet
? LOG_DEBUG
: LOG_NOTICE
,
294 "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
295 "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
296 "Proceeding anyway.");
298 arg_console_mode
= CONSOLE_PIPE
;
299 } else if (streq(arg
, "autopipe")) {
300 if (isatty(STDIN_FILENO
) > 0 && isatty(STDOUT_FILENO
) > 0)
301 arg_console_mode
= CONSOLE_INTERACTIVE
;
303 arg_console_mode
= CONSOLE_PIPE
;
/* Report the string that was actually parsed (the parameter) rather than getopt's global optarg,
 * which is not guaranteed to be what the caller passed in. */
305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Unknown console mode: %s", arg
);
307 arg_settings_mask
|= SETTING_CONSOLE_MODE
;
/* Prints the command line usage text.
 *
 * Opens the pager according to arg_pager_flags, asks terminal_urlify_man() for a reference to the
 * systemd-nspawn(1) man page (stored in 'link', freed automatically via _cleanup_free_) and then emits
 * the help text with a single printf(). The format string uses positional arguments: %1$s is the
 * program name (program_invocation_short_name is the first argument), %2$s appears in the closing
 * "See the ... for details" sentence, %3$s/%4$s bracket the section headings and %5$s/%6$s bracket the
 * one-line summary.
 *
 * NOTE(review): this excerpt omits a number of source lines of the function: some continuation lines
 * of the help text, the printf() arguments after the first one, the error check after
 * terminal_urlify_man() and the end of the function. */
311 static int help(void) {
312 _cleanup_free_
char *link
= NULL
;
315 pager_open(arg_pager_flags
);
317 r
= terminal_urlify_man("systemd-nspawn", "1", &link
);
321 printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
322 "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
323 " -h --help Show this help\n"
324 " --version Print version string\n"
325 " -q --quiet Do not show status information\n"
326 " --no-pager Do not pipe output into a pager\n"
327 " --settings=BOOLEAN Load additional settings from .nspawn file\n\n"
329 " -D --directory=PATH Root directory for the container\n"
330 " --template=PATH Initialize root directory from template directory,\n"
332 " -x --ephemeral Run container with snapshot of root directory, and\n"
333 " remove it after exit\n"
334 " -i --image=PATH Root file system disk image (or device node) for\n"
336 " --image-policy=POLICY Specify disk image dissection policy\n"
337 " --oci-bundle=PATH OCI bundle directory\n"
338 " --read-only Mount the root directory read-only\n"
339 " --volatile[=MODE] Run the system in volatile mode\n"
340 " --root-hash=HASH Specify verity root hash for root disk image\n"
341 " --root-hash-sig=SIG Specify pkcs7 signature of root hash for verity\n"
342 " as a DER encoded PKCS7, either as a path to a file\n"
343 " or as an ASCII base64 encoded string prefixed by\n"
345 " --verity-data=PATH Specify hash device for verity\n"
346 " --pivot-root=PATH[:PATH]\n"
347 " Pivot root to given directory in the container\n\n"
348 "%3$sExecution:%4$s\n"
349 " -a --as-pid2 Maintain a stub init as PID1, invoke binary as PID2\n"
350 " -b --boot Boot up full system (i.e. invoke init)\n"
351 " --chdir=PATH Set working directory in the container\n"
352 " -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
353 " -u --user=USER Run the command under specified user or UID\n"
354 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
355 " --notify-ready=BOOLEAN Receive notifications from the child init process\n"
356 " --suppress-sync=BOOLEAN\n"
357 " Suppress any form of disk data synchronization\n\n"
358 "%3$sSystem Identity:%4$s\n"
359 " -M --machine=NAME Set the machine name for the container\n"
360 " --hostname=NAME Override the hostname for the container\n"
361 " --uuid=UUID Set a specific machine UUID for the container\n\n"
362 "%3$sProperties:%4$s\n"
363 " -S --slice=SLICE Place the container in the specified slice\n"
364 " --property=NAME=VALUE Set scope unit property\n"
365 " --register=BOOLEAN Register container as machine\n"
366 " --keep-unit Do not register a scope for the machine, reuse\n"
367 " the service unit nspawn is running in\n\n"
368 "%3$sUser Namespacing:%4$s\n"
369 " --private-users=no Run without user namespacing\n"
370 " --private-users=yes|pick|identity\n"
371 " Run within user namespace, autoselect UID/GID range\n"
372 " --private-users=UIDBASE[:NUIDS]\n"
373 " Similar, but with user configured UID/GID range\n"
374 " --private-users-ownership=MODE\n"
375 " Adjust ('chown') or map ('map') OS tree ownership\n"
376 " to private UID/GID range\n"
377 " -U Equivalent to --private-users=pick and\n"
378 " --private-users-ownership=auto\n\n"
379 "%3$sNetworking:%4$s\n"
380 " --private-network Disable network in container\n"
381 " --network-interface=HOSTIF[:CONTAINERIF]\n"
382 " Assign an existing network interface to the\n"
384 " --network-macvlan=HOSTIF[:CONTAINERIF]\n"
385 " Create a macvlan network interface based on an\n"
386 " existing network interface to the container\n"
387 " --network-ipvlan=HOSTIF[:CONTAINERIF]\n"
388 " Create an ipvlan network interface based on an\n"
389 " existing network interface to the container\n"
390 " -n --network-veth Add a virtual Ethernet connection between host\n"
392 " --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
393 " Add an additional virtual Ethernet link between\n"
394 " host and container\n"
395 " --network-bridge=INTERFACE\n"
396 " Add a virtual Ethernet connection to the container\n"
397 " and attach it to an existing bridge on the host\n"
398 " --network-zone=NAME Similar, but attach the new interface to an\n"
399 " automatically managed bridge interface\n"
400 " --network-namespace-path=PATH\n"
401 " Set network namespace to the one represented by\n"
402 " the specified kernel namespace file node\n"
403 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
404 " Expose a container IP port on the host\n\n"
405 "%3$sSecurity:%4$s\n"
406 " --capability=CAP In addition to the default, retain specified\n"
408 " --drop-capability=CAP Drop the specified capability from the default set\n"
409 " --ambient-capability=CAP\n"
410 " Sets the specified capability for the started\n"
411 " process. Not useful if booting a machine.\n"
412 " --no-new-privileges Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
413 " --system-call-filter=LIST|~LIST\n"
414 " Permit/prohibit specific system calls\n"
415 " -Z --selinux-context=SECLABEL\n"
416 " Set the SELinux security context to be used by\n"
417 " processes in the container\n"
418 " -L --selinux-apifs-context=SECLABEL\n"
419 " Set the SELinux security context to be used by\n"
420 " API/tmpfs file systems in the container\n\n"
421 "%3$sResources:%4$s\n"
422 " --rlimit=NAME=LIMIT Set a resource limit for the payload\n"
423 " --oom-score-adjust=VALUE\n"
424 " Adjust the OOM score value for the payload\n"
425 " --cpu-affinity=CPUS Adjust the CPU affinity of the container\n"
426 " --personality=ARCH Pick personality for this container\n\n"
427 "%3$sIntegration:%4$s\n"
428 " --resolv-conf=MODE Select mode of /etc/resolv.conf initialization\n"
429 " --timezone=MODE Select mode of /etc/localtime initialization\n"
430 " --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
431 " host, try-guest, try-host\n"
432 " -j Equivalent to --link-journal=try-guest\n\n"
434 " --bind=PATH[:PATH[:OPTIONS]]\n"
435 " Bind mount a file or directory from the host into\n"
437 " --bind-ro=PATH[:PATH[:OPTIONS]]\n"
438 " Similar, but creates a read-only bind mount\n"
439 " --inaccessible=PATH Over-mount file node with inaccessible node to mask\n"
441 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
442 " --overlay=PATH[:PATH...]:PATH\n"
443 " Create an overlay mount from the host to \n"
445 " --overlay-ro=PATH[:PATH...]:PATH\n"
446 " Similar, but creates a read-only overlay mount\n"
447 " --bind-user=NAME Bind user from host to container\n\n"
448 "%3$sInput/Output:%4$s\n"
449 " --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
450 " set up for the container.\n"
451 " -P --pipe Equivalent to --console=pipe\n\n"
452 "%3$sCredentials:%4$s\n"
453 " --set-credential=ID:VALUE\n"
454 " Pass a credential with literal value to container.\n"
455 " --load-credential=ID:PATH\n"
456 " Load credential to pass to container from file or\n"
457 " AF_UNIX stream socket.\n"
458 "\nSee the %2$s for details.\n",
459 program_invocation_short_name
,
/* Checks the entries of arg_custom_mounts for combinations that conflict with user namespacing.
 *
 * For every custom mount whose destination is "/" while user namespacing is enabled
 * (arg_userns_mode != USER_NAMESPACE_NO) the function fails with EINVAL if
 *   - arg_userns_ownership is anything other than USER_NAMESPACE_OWNERSHIP_OFF, or
 *   - no explicit UID shift is configured (arg_uid_shift == UID_INVALID).
 *
 * NOTE(review): the first error text names "=own" although the check rejects every ownership mode
 * other than OFF — confirm the wording is intended.
 * NOTE(review): the declaration of 'i', the end of the loop and the final return are not visible in
 * this excerpt. */
469 static int custom_mount_check_all(void) {
472 for (i
= 0; i
< arg_n_custom_mounts
; i
++) {
473 CustomMount
*m
= &arg_custom_mounts
[i
];
/* Only a custom mount placed over the container's root is of interest, and only with userns on. */
475 if (path_equal(m
->destination
, "/") && arg_userns_mode
!= USER_NAMESPACE_NO
) {
476 if (arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_OFF
)
477 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
478 "--private-users-ownership=own may not be combined with custom root mounts.");
479 if (arg_uid_shift
== UID_INVALID
)
480 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
481 "--private-users with automatic UID shift may not be combined with custom root mounts.");
/* Lets an environment variable choose arg_unified_cgroup_hierarchy.
 *
 * 'var' names the variable being consulted: it starts out as "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY" and is
 * switched to the older name "UNIFIED_CGROUP_HIERARCHY" further down. The value in 'e' is parsed with
 * parse_boolean(); a parse failure is logged together with the variable name and returned. The visible
 * assignments select CGROUP_UNIFIED_ALL or CGROUP_UNIFIED_NONE.
 *
 * NOTE(review): the lines that read the environment, the conditions around the old-name fallback and
 * around the two assignments, and the final return are not visible in this excerpt; presumably a true
 * value selects CGROUP_UNIFIED_ALL and a false one CGROUP_UNIFIED_NONE — confirm. */
488 static int detect_unified_cgroup_hierarchy_from_environment(void) {
489 const char *e
, *var
= "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
492 /* Allow the user to control whether the unified hierarchy is used */
496 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
497 var
= "UNIFIED_CGROUP_HIERARCHY";
502 r
= parse_boolean(e
);
/* 'var' holds whichever name was consulted, so the message points at the right variable. */
504 return log_error_errno(r
, "Failed to parse $%s: %m", var
);
506 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
508 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
/* Chooses arg_unified_cgroup_hierarchy from the host's cgroup setup and the systemd version found in
 * the OS tree at 'directory'.
 *
 * The host mode is queried with cg_all_unified() and, in a later branch, with
 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER). The image is probed with
 * systemd_installation_has_version() for "230" (unified support) and "233" (mixed/hybrid support).
 * The visible assignments select CGROUP_UNIFIED_ALL, CGROUP_UNIFIED_SYSTEMD or CGROUP_UNIFIED_NONE,
 * and the result is reported via log_debug() as "legacy", "hybrid" or "unified". Failures of the
 * queries are logged and returned.
 *
 * NOTE(review): the conditions that connect the queries to the individual assignments, and the
 * function's final return, are on lines omitted from this excerpt. */
514 static int detect_unified_cgroup_hierarchy_from_image(const char *directory
) {
517 /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
518 * in the image actually supports. */
519 r
= cg_all_unified();
/* Include the error cause (%m), like the other error messages in this function do. */
521 return log_error_errno(r
, "Failed to determine whether we are in all unified mode: %m");
523 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
524 * routine only detects 231, so we'll have a false negative here for 230. */
525 r
= systemd_installation_has_version(directory
, "230");
527 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
529 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
531 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
532 } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0) {
533 /* Mixed cgroup hierarchy support was added in 233 */
534 r
= systemd_installation_has_version(directory
, "233");
536 return log_error_errno(r
, "Failed to determine systemd version in container: %m");
538 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_SYSTEMD
;
540 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
542 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
/* Anything that is neither NONE nor SYSTEMD is reported as "unified". */
544 log_debug("Using %s hierarchy for container.",
545 arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_NONE
? "legacy" :
546 arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_SYSTEMD
? "hybrid" : "unified");
/* Parses a comma-separated capability specification.
 *
 * 'spec' is split into words with extract_first_word() using "," as separator. The word "help" makes
 * the function walk all capability numbers below capability_list_length() and look up their names
 * with capability_to_name(); any other word visible here is resolved with capability_from_name(), and
 * an unknown name is logged and returned as error. The last visible statement returns 1, which the
 * existing comment labels "continue". 'ret_mask' is the output parameter for the resulting mask.
 *
 * NOTE(review): the surrounding loop, the accumulation of the mask, the store to *ret_mask and the
 * other return statements are on lines omitted from this excerpt. */
551 static int parse_capability_spec(const char *spec
, uint64_t *ret_mask
) {
556 _cleanup_free_
char *t
= NULL
;
558 r
= extract_first_word(&spec
, &t
, ",", 0);
/* 't' was initialised to NULL just above and no word has been extracted on this path, so do not
 * format it with %s; report the error cause instead. */
560 return log_error_errno(r
, "Failed to parse capability list: %m"
);
564 if (streq(t
, "help")) {
565 for (int i
= 0; i
< capability_list_length(); i
++) {
568 name
= capability_to_name(i
);
579 r
= capability_from_name(t
);
581 return log_error_errno(r
, "Failed to parse capability %s.", t
);
588 return 1; /* continue */
/* Applies a boolean "share namespace" environment variable to arg_clone_ns_flags.
 *
 * 'name' is the environment variable, read with getenv_bool(); 'ns_flag' is the set of CLONE_NEW* bits
 * it controls. A parse failure is logged with the variable name and returned. Otherwise the ns_flag
 * bits are first cleared and then set again unless the variable is true (r > 0) — i.e. a true value
 * leaves the namespace(s) shared, any other value reaching this point unshares them. The change is
 * recorded by adding SETTING_CLONE_NS_FLAGS to arg_settings_mask.
 *
 * NOTE(review): the conditions guarding the error return (including how an unset variable is treated)
 * and the function's return statements are on lines omitted from this excerpt. */
591 static int parse_share_ns_env(const char *name
, unsigned long ns_flag
) {
594 r
= getenv_bool(name
);
598 return log_error_errno(r
, "Failed to parse $%s: %m", name
);
/* Drop the affected bits, then re-add them only when sharing was not requested. */
600 arg_clone_ns_flags
= (arg_clone_ns_flags
& ~ns_flag
) | (r
> 0 ? 0 : ns_flag
);
601 arg_settings_mask
|= SETTING_CLONE_NS_FLAGS
;
/* Adjusts arg_mount_settings according to two environment variables.
 *
 * $SYSTEMD_NSPAWN_TMPFS_TMP is read with getenv_bool(); any error other than -ENXIO is logged and
 * returned, and MOUNT_APPLY_TMPFS_TMP is set or cleared depending on whether the value is true.
 *
 * $SYSTEMD_NSPAWN_API_VFS_WRITABLE is read with getenv(): the literal value "network" turns on both
 * MOUNT_APPLY_APIVFS_RO and MOUNT_APPLY_APIVFS_NETNS; otherwise the value is parsed as a boolean, with
 * MOUNT_APPLY_APIVFS_RO set when it is false (r == 0) and MOUNT_APPLY_APIVFS_NETNS cleared.
 *
 * NOTE(review): some conditions (e.g. what guards the first SET_FLAG and the boolean branch when the
 * variables are unset) and the final return are on lines omitted from this excerpt. */
605 static int parse_mount_settings_env(void) {
609 r
= getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
/* -ENXIO is tolerated here rather than treated as a parse failure. */
610 if (r
< 0 && r
!= -ENXIO
)
611 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
613 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_TMPFS_TMP
, r
> 0);
615 e
= getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
/* streq_ptr() is used because 'e' comes straight from getenv(). */
616 if (streq_ptr(e
, "network"))
617 arg_mount_settings
|= MOUNT_APPLY_APIVFS_RO
|MOUNT_APPLY_APIVFS_NETNS
;
620 r
= parse_boolean(e
);
622 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
/* "Writable" is the inverse of the read-only flag, hence r == 0. */
624 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_RO
, r
== 0);
625 SET_FLAG(arg_mount_settings
, MOUNT_APPLY_APIVFS_NETNS
, false);
/* Applies all environment variable overrides to the global settings.
 *
 * In order, the visible code:
 *   - feeds $SYSTEMD_NSPAWN_SHARE_NS_IPC, _PID, _UTS and $SYSTEMD_NSPAWN_SHARE_SYSTEM (all three
 *     namespaces at once) to parse_share_ns_env();
 *   - calls parse_mount_settings_env();
 *   - turns arg_use_cgns off if cg_ns_supported() reports no support, then lets
 *     $SYSTEMD_NSPAWN_USE_CGNS set it and records SETTING_USE_CGNS;
 *   - assigns $SYSTEMD_NSPAWN_CONTAINER_SERVICE to arg_container_service_name;
 *   - parses $SYSTEMD_NSPAWN_NETWORK_MAC into arg_network_provided_mac;
 *   - assigns $SYSTEMD_SUPPRESS_SYNC to arg_suppress_sync, logging (at debug level) and ignoring
 *     errors other than -ENXIO;
 *   - finishes with detect_unified_cgroup_hierarchy_from_environment(), whose result it returns.
 *
 * NOTE(review): the checks of 'r' after the helper calls and the conditions around several of the
 * assignments are on lines omitted from this excerpt. */
631 static int parse_environment(void) {
635 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC
);
638 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID
);
641 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS
);
644 r
= parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC
|CLONE_NEWPID
|CLONE_NEWUTS
);
648 r
= parse_mount_settings_env();
652 /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
653 * even if it is supported. If not supported, it has no effect. */
654 if (!cg_ns_supported())
655 arg_use_cgns
= false;
657 r
= getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
660 return log_error_errno(r
, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
664 arg_use_cgns
= r
> 0;
665 arg_settings_mask
|= SETTING_USE_CGNS
;
669 e
= getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
671 arg_container_service_name
= e
;
673 e
= getenv("SYSTEMD_NSPAWN_NETWORK_MAC");
675 r
= parse_ether_addr(e
, &arg_network_provided_mac
);
/* Include the error cause (%m), like the other error messages in this function do. */
677 return log_error_errno(r
, "Failed to parse provided MAC address via environment variable: %m");
680 r
= getenv_bool("SYSTEMD_SUPPRESS_SYNC");
682 arg_suppress_sync
= r
;
683 else if (r
!= -ENXIO
)
684 log_debug_errno(r
, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
686 return detect_unified_cgroup_hierarchy_from_environment();
689 static int parse_argv(int argc
, char *argv
[]) {
696 ARG_AMBIENT_CAPABILITY
,
708 ARG_NETWORK_INTERFACE
,
713 ARG_NETWORK_VETH_EXTRA
,
714 ARG_NETWORK_NAMESPACE_PATH
,
724 ARG_PRIVATE_USERS_CHOWN
,
725 ARG_PRIVATE_USERS_OWNERSHIP
,
730 ARG_SYSTEM_CALL_FILTER
,
733 ARG_NO_NEW_PRIVILEGES
,
734 ARG_OOM_SCORE_ADJUST
,
749 static const struct option options
[] = {
750 { "help", no_argument
, NULL
, 'h' },
751 { "version", no_argument
, NULL
, ARG_VERSION
},
752 { "directory", required_argument
, NULL
, 'D' },
753 { "template", required_argument
, NULL
, ARG_TEMPLATE
},
754 { "ephemeral", no_argument
, NULL
, 'x' },
755 { "user", required_argument
, NULL
, 'u' },
756 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
757 { "as-pid2", no_argument
, NULL
, 'a' },
758 { "boot", no_argument
, NULL
, 'b' },
759 { "uuid", required_argument
, NULL
, ARG_UUID
},
760 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
761 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
762 { "ambient-capability", required_argument
, NULL
, ARG_AMBIENT_CAPABILITY
},
763 { "drop-capability", required_argument
, NULL
, ARG_DROP_CAPABILITY
},
764 { "no-new-privileges", required_argument
, NULL
, ARG_NO_NEW_PRIVILEGES
},
765 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
766 { "bind", required_argument
, NULL
, ARG_BIND
},
767 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
768 { "tmpfs", required_argument
, NULL
, ARG_TMPFS
},
769 { "overlay", required_argument
, NULL
, ARG_OVERLAY
},
770 { "overlay-ro", required_argument
, NULL
, ARG_OVERLAY_RO
},
771 { "inaccessible", required_argument
, NULL
, ARG_INACCESSIBLE
},
772 { "machine", required_argument
, NULL
, 'M' },
773 { "hostname", required_argument
, NULL
, ARG_HOSTNAME
},
774 { "slice", required_argument
, NULL
, 'S' },
775 { "setenv", required_argument
, NULL
, 'E' },
776 { "selinux-context", required_argument
, NULL
, 'Z' },
777 { "selinux-apifs-context", required_argument
, NULL
, 'L' },
778 { "quiet", no_argument
, NULL
, 'q' },
779 { "share-system", no_argument
, NULL
, ARG_SHARE_SYSTEM
}, /* not documented */
780 { "register", required_argument
, NULL
, ARG_REGISTER
},
781 { "keep-unit", no_argument
, NULL
, ARG_KEEP_UNIT
},
782 { "network-interface", required_argument
, NULL
, ARG_NETWORK_INTERFACE
},
783 { "network-macvlan", required_argument
, NULL
, ARG_NETWORK_MACVLAN
},
784 { "network-ipvlan", required_argument
, NULL
, ARG_NETWORK_IPVLAN
},
785 { "network-veth", no_argument
, NULL
, 'n' },
786 { "network-veth-extra", required_argument
, NULL
, ARG_NETWORK_VETH_EXTRA
},
787 { "network-bridge", required_argument
, NULL
, ARG_NETWORK_BRIDGE
},
788 { "network-zone", required_argument
, NULL
, ARG_NETWORK_ZONE
},
789 { "network-namespace-path", required_argument
, NULL
, ARG_NETWORK_NAMESPACE_PATH
},
790 { "personality", required_argument
, NULL
, ARG_PERSONALITY
},
791 { "image", required_argument
, NULL
, 'i' },
792 { "volatile", optional_argument
, NULL
, ARG_VOLATILE
},
793 { "port", required_argument
, NULL
, 'p' },
794 { "property", required_argument
, NULL
, ARG_PROPERTY
},
795 { "private-users", optional_argument
, NULL
, ARG_PRIVATE_USERS
},
796 { "private-users-chown", optional_argument
, NULL
, ARG_PRIVATE_USERS_CHOWN
}, /* obsolete */
797 { "private-users-ownership",required_argument
, NULL
, ARG_PRIVATE_USERS_OWNERSHIP
},
798 { "kill-signal", required_argument
, NULL
, ARG_KILL_SIGNAL
},
799 { "settings", required_argument
, NULL
, ARG_SETTINGS
},
800 { "chdir", required_argument
, NULL
, ARG_CHDIR
},
801 { "pivot-root", required_argument
, NULL
, ARG_PIVOT_ROOT
},
802 { "notify-ready", required_argument
, NULL
, ARG_NOTIFY_READY
},
803 { "root-hash", required_argument
, NULL
, ARG_ROOT_HASH
},
804 { "root-hash-sig", required_argument
, NULL
, ARG_ROOT_HASH_SIG
},
805 { "verity-data", required_argument
, NULL
, ARG_VERITY_DATA
},
806 { "system-call-filter", required_argument
, NULL
, ARG_SYSTEM_CALL_FILTER
},
807 { "rlimit", required_argument
, NULL
, ARG_RLIMIT
},
808 { "oom-score-adjust", required_argument
, NULL
, ARG_OOM_SCORE_ADJUST
},
809 { "cpu-affinity", required_argument
, NULL
, ARG_CPU_AFFINITY
},
810 { "resolv-conf", required_argument
, NULL
, ARG_RESOLV_CONF
},
811 { "timezone", required_argument
, NULL
, ARG_TIMEZONE
},
812 { "console", required_argument
, NULL
, ARG_CONSOLE
},
813 { "pipe", no_argument
, NULL
, ARG_PIPE
},
814 { "oci-bundle", required_argument
, NULL
, ARG_OCI_BUNDLE
},
815 { "no-pager", no_argument
, NULL
, ARG_NO_PAGER
},
816 { "set-credential", required_argument
, NULL
, ARG_SET_CREDENTIAL
},
817 { "load-credential", required_argument
, NULL
, ARG_LOAD_CREDENTIAL
},
818 { "bind-user", required_argument
, NULL
, ARG_BIND_USER
},
819 { "suppress-sync", required_argument
, NULL
, ARG_SUPPRESS_SYNC
},
820 { "image-policy", required_argument
, NULL
, ARG_IMAGE_POLICY
},
825 uint64_t plus
= 0, minus
= 0;
826 bool mask_all_settings
= false, mask_no_settings
= false;
831 /* Resetting to 0 forces the invocation of an internal initialization routine of getopt_long()
832 * that checks for GNU extensions in optstring ('-' or '+' at the beginning). */
834 while ((c
= getopt_long(argc
, argv
, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options
, NULL
)) >= 0)
844 r
= parse_path_argument(optarg
, false, &arg_directory
);
848 arg_settings_mask
|= SETTING_DIRECTORY
;
852 r
= parse_path_argument(optarg
, false, &arg_template
);
856 arg_settings_mask
|= SETTING_DIRECTORY
;
860 r
= parse_path_argument(optarg
, false, &arg_image
);
864 arg_settings_mask
|= SETTING_DIRECTORY
;
868 r
= parse_path_argument(optarg
, false, &arg_oci_bundle
);
875 arg_ephemeral
= true;
876 arg_settings_mask
|= SETTING_EPHEMERAL
;
880 r
= free_and_strdup(&arg_user
, optarg
);
884 arg_settings_mask
|= SETTING_USER
;
887 case ARG_NETWORK_ZONE
: {
888 _cleanup_free_
char *j
= NULL
;
890 j
= strjoin("vz-", optarg
);
894 if (!ifname_valid(j
))
895 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
896 "Network zone name not valid: %s", j
);
898 free_and_replace(arg_network_zone
, j
);
900 arg_network_veth
= true;
901 arg_private_network
= true;
902 arg_settings_mask
|= SETTING_NETWORK
;
906 case ARG_NETWORK_BRIDGE
:
908 if (!ifname_valid(optarg
))
909 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
910 "Bridge interface name not valid: %s", optarg
);
912 r
= free_and_strdup(&arg_network_bridge
, optarg
);
918 arg_network_veth
= true;
919 arg_private_network
= true;
920 arg_settings_mask
|= SETTING_NETWORK
;
923 case ARG_NETWORK_VETH_EXTRA
:
924 r
= veth_extra_parse(&arg_network_veth_extra
, optarg
);
926 return log_error_errno(r
, "Failed to parse --network-veth-extra= parameter: %s", optarg
);
928 arg_private_network
= true;
929 arg_settings_mask
|= SETTING_NETWORK
;
932 case ARG_NETWORK_INTERFACE
:
933 r
= interface_pair_parse(&arg_network_interfaces
, optarg
);
937 arg_private_network
= true;
938 arg_settings_mask
|= SETTING_NETWORK
;
941 case ARG_NETWORK_MACVLAN
:
942 r
= macvlan_pair_parse(&arg_network_macvlan
, optarg
);
946 arg_private_network
= true;
947 arg_settings_mask
|= SETTING_NETWORK
;
950 case ARG_NETWORK_IPVLAN
:
951 r
= ipvlan_pair_parse(&arg_network_ipvlan
, optarg
);
956 case ARG_PRIVATE_NETWORK
:
957 arg_private_network
= true;
958 arg_settings_mask
|= SETTING_NETWORK
;
961 case ARG_NETWORK_NAMESPACE_PATH
:
962 r
= parse_path_argument(optarg
, false, &arg_network_namespace_path
);
966 arg_settings_mask
|= SETTING_NETWORK
;
970 if (arg_start_mode
== START_PID2
)
971 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
972 "--boot and --as-pid2 may not be combined.");
974 arg_start_mode
= START_BOOT
;
975 arg_settings_mask
|= SETTING_START_MODE
;
979 if (arg_start_mode
== START_BOOT
)
980 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
981 "--boot and --as-pid2 may not be combined.");
983 arg_start_mode
= START_PID2
;
984 arg_settings_mask
|= SETTING_START_MODE
;
988 r
= id128_from_string_nonzero(optarg
, &arg_uuid
);
990 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
991 "Machine UUID may not be all zeroes.");
993 return log_error_errno(r
, "Invalid UUID: %s", optarg
);
995 arg_settings_mask
|= SETTING_MACHINE_ID
;
999 _cleanup_free_
char *mangled
= NULL
;
1001 r
= unit_name_mangle_with_suffix(optarg
, NULL
, UNIT_NAME_MANGLE_WARN
, ".slice", &mangled
);
1005 free_and_replace(arg_slice
, mangled
);
1006 arg_settings_mask
|= SETTING_SLICE
;
1011 if (isempty(optarg
))
1012 arg_machine
= mfree(arg_machine
);
1014 if (!hostname_is_valid(optarg
, 0))
1015 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1016 "Invalid machine name: %s", optarg
);
1018 r
= free_and_strdup(&arg_machine
, optarg
);
1025 if (isempty(optarg
))
1026 arg_hostname
= mfree(arg_hostname
);
1028 if (!hostname_is_valid(optarg
, 0))
1029 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1030 "Invalid hostname: %s", optarg
);
1032 r
= free_and_strdup(&arg_hostname
, optarg
);
1037 arg_settings_mask
|= SETTING_HOSTNAME
;
1041 arg_selinux_context
= optarg
;
1045 arg_selinux_apifs_context
= optarg
;
1049 arg_read_only
= true;
1050 arg_settings_mask
|= SETTING_READ_ONLY
;
1053 case ARG_AMBIENT_CAPABILITY
: {
1055 r
= parse_capability_spec(optarg
, &m
);
1058 arg_caps_ambient
|= m
;
1059 arg_settings_mask
|= SETTING_CAPABILITY
;
1062 case ARG_CAPABILITY
:
1063 case ARG_DROP_CAPABILITY
: {
1065 r
= parse_capability_spec(optarg
, &m
);
1069 if (c
== ARG_CAPABILITY
)
1073 arg_settings_mask
|= SETTING_CAPABILITY
;
1076 case ARG_NO_NEW_PRIVILEGES
:
1077 r
= parse_boolean(optarg
);
1079 return log_error_errno(r
, "Failed to parse --no-new-privileges= argument: %s", optarg
);
1081 arg_no_new_privileges
= r
;
1082 arg_settings_mask
|= SETTING_NO_NEW_PRIVILEGES
;
1086 arg_link_journal
= LINK_GUEST
;
1087 arg_link_journal_try
= true;
1088 arg_settings_mask
|= SETTING_LINK_JOURNAL
;
1091 case ARG_LINK_JOURNAL
:
1092 r
= parse_link_journal(optarg
, &arg_link_journal
, &arg_link_journal_try
);
1094 return log_error_errno(r
, "Failed to parse link journal mode %s", optarg
);
1096 arg_settings_mask
|= SETTING_LINK_JOURNAL
;
1101 r
= bind_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_BIND_RO
);
1103 return log_error_errno(r
, "Failed to parse --bind(-ro)= argument %s: %m", optarg
);
1105 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1109 r
= tmpfs_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
1111 return log_error_errno(r
, "Failed to parse --tmpfs= argument %s: %m", optarg
);
1113 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1117 case ARG_OVERLAY_RO
:
1118 r
= overlay_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
, c
== ARG_OVERLAY_RO
);
1119 if (r
== -EADDRNOTAVAIL
)
1120 return log_error_errno(r
, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1122 return log_error_errno(r
, "Failed to parse --overlay(-ro)= argument %s: %m", optarg
);
1124 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1127 case ARG_INACCESSIBLE
:
1128 r
= inaccessible_mount_parse(&arg_custom_mounts
, &arg_n_custom_mounts
, optarg
);
1130 return log_error_errno(r
, "Failed to parse --inaccessible= argument %s: %m", optarg
);
1132 arg_settings_mask
|= SETTING_CUSTOM_MOUNTS
;
1136 r
= strv_env_replace_strdup_passthrough(&arg_setenv
, optarg
);
1138 return log_error_errno(r
, "Cannot assign environment variable %s: %m", optarg
);
1140 arg_settings_mask
|= SETTING_ENVIRONMENT
;
1147 case ARG_SHARE_SYSTEM
:
1148 /* We don't officially support this anymore, except for compat reasons. People should use the
1149 * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1150 log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1151 arg_clone_ns_flags
= 0;
1155 r
= parse_boolean(optarg
);
1157 log_error("Failed to parse --register= argument: %s", optarg
);
1165 arg_keep_unit
= true;
1168 case ARG_PERSONALITY
:
1170 arg_personality
= personality_from_string(optarg
);
1171 if (arg_personality
== PERSONALITY_INVALID
)
1172 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1173 "Unknown or unsupported personality '%s'.", optarg
);
1175 arg_settings_mask
|= SETTING_PERSONALITY
;
1181 arg_volatile_mode
= VOLATILE_YES
;
1182 else if (streq(optarg
, "help")) {
1183 DUMP_STRING_TABLE(volatile_mode
, VolatileMode
, _VOLATILE_MODE_MAX
);
1188 m
= volatile_mode_from_string(optarg
);
1190 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1191 "Failed to parse --volatile= argument: %s", optarg
);
1193 arg_volatile_mode
= m
;
1196 arg_settings_mask
|= SETTING_VOLATILE_MODE
;
1200 r
= expose_port_parse(&arg_expose_ports
, optarg
);
1202 return log_error_errno(r
, "Duplicate port specification: %s", optarg
);
1204 return log_error_errno(r
, "Failed to parse host port %s: %m", optarg
);
1206 arg_settings_mask
|= SETTING_EXPOSE_PORTS
;
1210 if (strv_extend(&arg_property
, optarg
) < 0)
1215 case ARG_PRIVATE_USERS
: {
1220 else if (!in_charset(optarg
, DIGITS
))
1221 /* do *not* parse numbers as booleans */
1222 boolean
= parse_boolean(optarg
);
1227 /* no: User namespacing off */
1228 arg_userns_mode
= USER_NAMESPACE_NO
;
1229 arg_uid_shift
= UID_INVALID
;
1230 arg_uid_range
= UINT32_C(0x10000);
1231 } else if (boolean
> 0) {
1232 /* yes: User namespacing on, UID range is read from root dir */
1233 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1234 arg_uid_shift
= UID_INVALID
;
1235 arg_uid_range
= UINT32_C(0x10000);
1236 } else if (streq(optarg
, "pick")) {
1237 /* pick: User namespacing on, UID range is picked randomly */
1238 arg_userns_mode
= USER_NAMESPACE_PICK
; /* Note that arg_userns_ownership is
1239 * implied by USER_NAMESPACE_PICK
1241 arg_uid_shift
= UID_INVALID
;
1242 arg_uid_range
= UINT32_C(0x10000);
1244 } else if (streq(optarg
, "identity")) {
1245 /* identity: User namespaces on, UID range is map the 0…0xFFFF range to
1246 * itself, i.e. we don't actually map anything, but do take benefit of
1247 * isolation of capability sets. */
1248 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1250 arg_uid_range
= UINT32_C(0x10000);
1252 _cleanup_free_
char *buffer
= NULL
;
1253 const char *range
, *shift
;
1255 /* anything else: User namespacing on, UID range is explicitly configured */
1257 range
= strchr(optarg
, ':');
1259 buffer
= strndup(optarg
, range
- optarg
);
1265 r
= safe_atou32(range
, &arg_uid_range
);
1267 return log_error_errno(r
, "Failed to parse UID range \"%s\": %m", range
);
1271 r
= parse_uid(shift
, &arg_uid_shift
);
1273 return log_error_errno(r
, "Failed to parse UID \"%s\": %m", optarg
);
1275 arg_userns_mode
= USER_NAMESPACE_FIXED
;
1277 if (!userns_shift_range_valid(arg_uid_shift
, arg_uid_range
))
1278 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "UID range cannot be empty or go beyond " UID_FMT
".", UID_INVALID
);
1281 arg_settings_mask
|= SETTING_USERNS
;
1286 if (userns_supported()) {
1287 arg_userns_mode
= USER_NAMESPACE_PICK
; /* Note that arg_userns_ownership is
1288 * implied by USER_NAMESPACE_PICK
1290 arg_uid_shift
= UID_INVALID
;
1291 arg_uid_range
= UINT32_C(0x10000);
1293 arg_settings_mask
|= SETTING_USERNS
;
1298 case ARG_PRIVATE_USERS_CHOWN
:
1299 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
1301 arg_settings_mask
|= SETTING_USERNS
;
1304 case ARG_PRIVATE_USERS_OWNERSHIP
:
1305 if (streq(optarg
, "help")) {
1306 DUMP_STRING_TABLE(user_namespace_ownership
, UserNamespaceOwnership
, _USER_NAMESPACE_OWNERSHIP_MAX
);
1310 arg_userns_ownership
= user_namespace_ownership_from_string(optarg
);
1311 if (arg_userns_ownership
< 0)
1312 return log_error_errno(arg_userns_ownership
, "Cannot parse --user-namespace-ownership= value: %s", optarg
);
1314 arg_settings_mask
|= SETTING_USERNS
;
1317 case ARG_KILL_SIGNAL
:
1318 if (streq(optarg
, "help")) {
1319 DUMP_STRING_TABLE(signal
, int, _NSIG
);
1323 arg_kill_signal
= signal_from_string(optarg
);
1324 if (arg_kill_signal
< 0)
1325 return log_error_errno(arg_kill_signal
, "Cannot parse signal: %s", optarg
);
1327 arg_settings_mask
|= SETTING_KILL_SIGNAL
;
1332 /* no → do not read files
1333 * yes → read files, do not override cmdline, trust only subset
1334 * override → read files, override cmdline, trust only subset
1335 * trusted → read files, do not override cmdline, trust all
1338 r
= parse_boolean(optarg
);
1340 if (streq(optarg
, "trusted")) {
1341 mask_all_settings
= false;
1342 mask_no_settings
= false;
1343 arg_settings_trusted
= true;
1345 } else if (streq(optarg
, "override")) {
1346 mask_all_settings
= false;
1347 mask_no_settings
= true;
1348 arg_settings_trusted
= -1;
1350 return log_error_errno(r
, "Failed to parse --settings= argument: %s", optarg
);
1353 mask_all_settings
= false;
1354 mask_no_settings
= false;
1355 arg_settings_trusted
= -1;
1358 mask_all_settings
= true;
1359 mask_no_settings
= false;
1360 arg_settings_trusted
= false;
1366 if (!path_is_absolute(optarg
))
1367 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1368 "Working directory %s is not an absolute path.", optarg
);
1370 r
= free_and_strdup(&arg_chdir
, optarg
);
1374 arg_settings_mask
|= SETTING_WORKING_DIRECTORY
;
1377 case ARG_PIVOT_ROOT
:
1378 r
= pivot_root_parse(&arg_pivot_root_new
, &arg_pivot_root_old
, optarg
);
1380 return log_error_errno(r
, "Failed to parse --pivot-root= argument %s: %m", optarg
);
1382 arg_settings_mask
|= SETTING_PIVOT_ROOT
;
1385 case ARG_NOTIFY_READY
:
1386 r
= parse_boolean(optarg
);
1388 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1389 "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg
);
1390 arg_notify_ready
= r
;
1391 arg_settings_mask
|= SETTING_NOTIFY_READY
;
1394 case ARG_ROOT_HASH
: {
1395 _cleanup_free_
void *k
= NULL
;
1398 r
= unhexmem(optarg
, strlen(optarg
), &k
, &l
);
1400 return log_error_errno(r
, "Failed to parse root hash: %s", optarg
);
1401 if (l
< sizeof(sd_id128_t
))
1402 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Root hash must be at least 128-bit long: %s", optarg
);
1404 free_and_replace(arg_verity_settings
.root_hash
, k
);
1405 arg_verity_settings
.root_hash_size
= l
;
1409 case ARG_ROOT_HASH_SIG
: {
1414 if ((value
= startswith(optarg
, "base64:"))) {
1415 r
= unbase64mem(value
, strlen(value
), &p
, &l
);
1417 return log_error_errno(r
, "Failed to parse root hash signature '%s': %m", optarg
);
1420 r
= read_full_file(optarg
, (char**) &p
, &l
);
1422 return log_error_errno(r
, "Failed parse root hash signature file '%s': %m", optarg
);
1425 free_and_replace(arg_verity_settings
.root_hash_sig
, p
);
1426 arg_verity_settings
.root_hash_sig_size
= l
;
1430 case ARG_VERITY_DATA
:
1431 r
= parse_path_argument(optarg
, false, &arg_verity_settings
.data_path
);
1436 case ARG_SYSTEM_CALL_FILTER
: {
1440 negative
= optarg
[0] == '~';
1441 items
= negative
? optarg
+ 1 : optarg
;
1444 _cleanup_free_
char *word
= NULL
;
1446 r
= extract_first_word(&items
, &word
, NULL
, 0);
1452 return log_error_errno(r
, "Failed to parse system call filter: %m");
1455 r
= strv_extend(&arg_syscall_deny_list
, word
);
1457 r
= strv_extend(&arg_syscall_allow_list
, word
);
1462 arg_settings_mask
|= SETTING_SYSCALL_FILTER
;
1468 _cleanup_free_
char *name
= NULL
;
1471 if (streq(optarg
, "help")) {
1472 DUMP_STRING_TABLE(rlimit
, int, _RLIMIT_MAX
);
1476 eq
= strchr(optarg
, '=');
1478 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1479 "--rlimit= expects an '=' assignment.");
1481 name
= strndup(optarg
, eq
- optarg
);
1485 rl
= rlimit_from_string_harder(name
);
1487 return log_error_errno(rl
, "Unknown resource limit: %s", name
);
1489 if (!arg_rlimit
[rl
]) {
1490 arg_rlimit
[rl
] = new0(struct rlimit
, 1);
1491 if (!arg_rlimit
[rl
])
1495 r
= rlimit_parse(rl
, eq
+ 1, arg_rlimit
[rl
]);
1497 return log_error_errno(r
, "Failed to parse resource limit: %s", eq
+ 1);
1499 arg_settings_mask
|= SETTING_RLIMIT_FIRST
<< rl
;
1503 case ARG_OOM_SCORE_ADJUST
:
1504 r
= parse_oom_score_adjust(optarg
, &arg_oom_score_adjust
);
1506 return log_error_errno(r
, "Failed to parse --oom-score-adjust= parameter: %s", optarg
);
1508 arg_oom_score_adjust_set
= true;
1509 arg_settings_mask
|= SETTING_OOM_SCORE_ADJUST
;
1512 case ARG_CPU_AFFINITY
: {
1515 r
= parse_cpu_set(optarg
, &cpuset
);
1517 return log_error_errno(r
, "Failed to parse CPU affinity mask %s: %m", optarg
);
1519 cpu_set_reset(&arg_cpu_set
);
1520 arg_cpu_set
= cpuset
;
1521 arg_settings_mask
|= SETTING_CPU_AFFINITY
;
1525 case ARG_RESOLV_CONF
:
1526 if (streq(optarg
, "help")) {
1527 DUMP_STRING_TABLE(resolv_conf_mode
, ResolvConfMode
, _RESOLV_CONF_MODE_MAX
);
1531 arg_resolv_conf
= resolv_conf_mode_from_string(optarg
);
1532 if (arg_resolv_conf
< 0)
1533 return log_error_errno(arg_resolv_conf
,
1534 "Failed to parse /etc/resolv.conf mode: %s", optarg
);
1536 arg_settings_mask
|= SETTING_RESOLV_CONF
;
1540 if (streq(optarg
, "help")) {
1541 DUMP_STRING_TABLE(timezone_mode
, TimezoneMode
, _TIMEZONE_MODE_MAX
);
1545 arg_timezone
= timezone_mode_from_string(optarg
);
1546 if (arg_timezone
< 0)
1547 return log_error_errno(arg_timezone
,
1548 "Failed to parse /etc/localtime mode: %s", optarg
);
1550 arg_settings_mask
|= SETTING_TIMEZONE
;
1554 r
= handle_arg_console(optarg
);
1561 r
= handle_arg_console("pipe");
1567 arg_pager_flags
|= PAGER_DISABLE
;
1570 case ARG_SET_CREDENTIAL
: {
1571 _cleanup_free_
char *word
= NULL
, *data
= NULL
;
1572 const char *p
= optarg
;
1576 r
= extract_first_word(&p
, &word
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1580 return log_error_errno(r
, "Failed to parse --set-credential= parameter: %m");
1582 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Missing value for --set-credential=: %s", optarg
);
1584 if (!credential_name_valid(word
))
1585 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential name is not valid: %s", word
);
1587 for (size_t i
= 0; i
< arg_n_credentials
; i
++)
1588 if (streq(arg_credentials
[i
].id
, word
))
1589 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
), "Duplicate credential '%s', refusing.", word
);
1591 l
= cunescape(p
, UNESCAPE_ACCEPT_NUL
, &data
);
1593 return log_error_errno(l
, "Failed to unescape credential data: %s", p
);
1595 a
= reallocarray(arg_credentials
, arg_n_credentials
+ 1, sizeof(Credential
));
1599 a
[arg_n_credentials
++] = (Credential
) {
1600 .id
= TAKE_PTR(word
),
1601 .data
= TAKE_PTR(data
),
1605 arg_credentials
= a
;
1607 arg_settings_mask
|= SETTING_CREDENTIALS
;
1611 case ARG_LOAD_CREDENTIAL
: {
1612 ReadFullFileFlags flags
= READ_FULL_FILE_SECURE
;
1613 _cleanup_(erase_and_freep
) char *data
= NULL
;
1614 _cleanup_free_
char *word
= NULL
, *j
= NULL
;
1615 const char *p
= optarg
;
1619 r
= extract_first_word(&p
, &word
, ":", EXTRACT_DONT_COALESCE_SEPARATORS
);
1623 return log_error_errno(r
, "Failed to parse --load-credential= parameter: %m");
1625 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Missing value for --load-credential=: %s", optarg
);
1627 if (!credential_name_valid(word
))
1628 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Credential name is not valid: %s", word
);
1630 for (i
= 0; i
< arg_n_credentials
; i
++)
1631 if (streq(arg_credentials
[i
].id
, word
))
1632 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
), "Duplicate credential '%s', refusing.", word
);
1634 if (path_is_absolute(p
))
1635 flags
|= READ_FULL_FILE_CONNECT_SOCKET
;
1639 r
= get_credentials_dir(&e
);
1641 return log_error_errno(r
, "Credential not available (no credentials passed at all): %s", word
);
1643 j
= path_join(e
, p
);
1648 r
= read_full_file_full(AT_FDCWD
, j
?: p
, UINT64_MAX
, SIZE_MAX
,
1653 return log_error_errno(r
, "Failed to read credential '%s': %m", j
?: p
);
1655 a
= reallocarray(arg_credentials
, arg_n_credentials
+ 1, sizeof(Credential
));
1659 a
[arg_n_credentials
++] = (Credential
) {
1660 .id
= TAKE_PTR(word
),
1661 .data
= TAKE_PTR(data
),
1665 arg_credentials
= a
;
1667 arg_settings_mask
|= SETTING_CREDENTIALS
;
1672 if (!valid_user_group_name(optarg
, 0))
1673 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Invalid user name to bind: %s", optarg
);
1675 if (strv_extend(&arg_bind_user
, optarg
) < 0)
1678 arg_settings_mask
|= SETTING_BIND_USER
;
1681 case ARG_SUPPRESS_SYNC
:
1682 r
= parse_boolean_argument("--suppress-sync=", optarg
, &arg_suppress_sync
);
1686 arg_settings_mask
|= SETTING_SUPPRESS_SYNC
;
1689 case ARG_IMAGE_POLICY
:
1690 r
= parse_image_policy_argument(optarg
, &arg_image_policy
);
1699 assert_not_reached();
1702 if (argc
> optind
) {
1703 strv_free(arg_parameters
);
1704 arg_parameters
= strv_copy(argv
+ optind
);
1705 if (!arg_parameters
)
1708 arg_settings_mask
|= SETTING_START_MODE
;
1711 if (arg_ephemeral
&& arg_template
&& !arg_directory
)
1712 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1713 * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1714 * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1716 arg_directory
= TAKE_PTR(arg_template
);
1718 arg_caps_retain
|= plus
;
1719 arg_caps_retain
|= arg_private_network
? UINT64_C(1) << CAP_NET_ADMIN
: 0;
1721 /* If we're not unsharing the network namespace and are unsharing the user namespace, we won't have
1722 * permissions to bind ports in the container, so let's drop the CAP_NET_BIND_SERVICE capability to
1724 if (!arg_private_network
&& arg_userns_mode
!= USER_NAMESPACE_NO
&& arg_uid_shift
> 0)
1725 arg_caps_retain
&= ~(UINT64_C(1) << CAP_NET_BIND_SERVICE
);
1727 arg_caps_retain
&= ~minus
;
1729 /* Make sure to parse environment before we reset the settings mask below */
1730 r
= parse_environment();
1734 /* Load all settings from .nspawn files */
1735 if (mask_no_settings
)
1736 arg_settings_mask
= 0;
1738 /* Don't load any settings from .nspawn files */
1739 if (mask_all_settings
)
1740 arg_settings_mask
= _SETTINGS_MASK_ALL
;
1745 static int verify_arguments(void) {
1748 if (arg_start_mode
== START_PID2
&& arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
1749 /* If we are running the stub init in the container, we don't need to look at what the init
1750 * in the container supports, because we are not using it. Let's immediately pick the right
1751 * setting based on the host system configuration.
1753 * We only do this, if the user didn't use an environment variable to override the detection.
1756 r
= cg_all_unified();
1758 return log_error_errno(r
, "Failed to determine whether we are in all unified mode.");
1760 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_ALL
;
1761 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER
) > 0)
1762 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_SYSTEMD
;
1764 arg_unified_cgroup_hierarchy
= CGROUP_UNIFIED_NONE
;
1767 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
1768 arg_mount_settings
|= MOUNT_USE_USERNS
;
1770 if (arg_private_network
)
1771 arg_mount_settings
|= MOUNT_APPLY_APIVFS_NETNS
;
1773 if (!(arg_clone_ns_flags
& CLONE_NEWPID
) ||
1774 !(arg_clone_ns_flags
& CLONE_NEWUTS
)) {
1775 arg_register
= false;
1776 if (arg_start_mode
!= START_PID1
)
1777 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--boot cannot be used without namespacing.");
1780 if (arg_userns_ownership
< 0)
1781 arg_userns_ownership
=
1782 arg_userns_mode
== USER_NAMESPACE_PICK
? USER_NAMESPACE_OWNERSHIP_AUTO
:
1783 USER_NAMESPACE_OWNERSHIP_OFF
;
1785 if (arg_start_mode
== START_BOOT
&& arg_kill_signal
<= 0)
1786 arg_kill_signal
= SIGRTMIN
+3;
1788 if (arg_volatile_mode
!= VOLATILE_NO
) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1789 arg_read_only
= true;
1791 if (has_custom_root_mount(arg_custom_mounts
, arg_n_custom_mounts
))
1792 arg_read_only
= true;
1794 if (arg_keep_unit
&& arg_register
&& cg_pid_get_owner_uid(0, NULL
) >= 0)
1795 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1796 * The latter is not technically a user session, but we don't need to labour the point. */
1797 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--keep-unit --register=yes may not be used when invoked from a user session.");
1799 if (arg_directory
&& arg_image
)
1800 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--directory= and --image= may not be combined.");
1802 if (arg_template
&& arg_image
)
1803 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--template= and --image= may not be combined.");
1805 if (arg_template
&& !(arg_directory
|| arg_machine
))
1806 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--template= needs --directory= or --machine=.");
1808 if (arg_ephemeral
&& arg_template
)
1809 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--ephemeral and --template= may not be combined.");
1811 if (arg_ephemeral
&& !IN_SET(arg_link_journal
, LINK_NO
, LINK_AUTO
))
1812 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--ephemeral and --link-journal= may not be combined.");
1814 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !userns_supported())
1815 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
), "--private-users= is not supported, kernel compiled without user namespace support.");
1817 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_CHOWN
&& arg_read_only
)
1818 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1819 "--read-only and --private-users-ownership=chown may not be combined.");
1821 /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1822 * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1823 * massive copy-up (in case of overlay) making the entire exercise pointless. */
1824 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_CHOWN
&& arg_volatile_mode
!= VOLATILE_NO
)
1825 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--volatile= and --private-users-ownership=chown may not be combined.");
1827 /* If --network-namespace-path is given with any other network-related option (except --private-network),
1828 * we need to error out, to avoid conflicts between different network options. */
1829 if (arg_network_namespace_path
&&
1830 (arg_network_interfaces
|| arg_network_macvlan
||
1831 arg_network_ipvlan
|| arg_network_veth_extra
||
1832 arg_network_bridge
|| arg_network_zone
||
1834 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--network-namespace-path= cannot be combined with other network options.");
1836 if (arg_network_bridge
&& arg_network_zone
)
1837 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
1838 "--network-bridge= and --network-zone= may not be combined.");
1840 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& (arg_mount_settings
& MOUNT_APPLY_APIVFS_NETNS
) && !arg_private_network
)
1841 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1843 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& !(arg_mount_settings
& MOUNT_APPLY_APIVFS_RO
))
1844 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Cannot combine --private-users with read-write mounts.");
1846 if (arg_expose_ports
&& !arg_private_network
)
1847 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Cannot use --port= without private networking.");
1849 if (arg_caps_ambient
) {
1850 if (arg_caps_ambient
== UINT64_MAX
)
1851 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= does not support the value all.");
1853 if ((arg_caps_ambient
& arg_caps_retain
) != arg_caps_ambient
)
1854 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= setting is not fully covered by Capability= setting.");
1856 if (arg_start_mode
== START_BOOT
)
1857 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "AmbientCapability= setting is not useful for boot mode.");
1860 if (arg_userns_mode
== USER_NAMESPACE_NO
&& !strv_isempty(arg_bind_user
))
1861 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "--bind-user= requires --private-users");
1863 /* Drop duplicate --bind-user= entries */
1864 strv_uniq(arg_bind_user
);
1866 r
= custom_mount_check_all();
1873 static int verify_network_interfaces_initialized(void) {
1875 r
= test_network_interfaces_initialized(arg_network_interfaces
);
1879 r
= test_network_interfaces_initialized(arg_network_macvlan
);
1883 r
= test_network_interfaces_initialized(arg_network_ipvlan
);
1890 int userns_lchown(const char *p
, uid_t uid
, gid_t gid
) {
1893 if (arg_userns_mode
== USER_NAMESPACE_NO
)
1896 if (uid
== UID_INVALID
&& gid
== GID_INVALID
)
1899 if (uid
!= UID_INVALID
) {
1900 uid
+= arg_uid_shift
;
1902 if (uid
< arg_uid_shift
|| uid
>= arg_uid_shift
+ arg_uid_range
)
1906 if (gid
!= GID_INVALID
) {
1907 gid
+= (gid_t
) arg_uid_shift
;
1909 if (gid
< (gid_t
) arg_uid_shift
|| gid
>= (gid_t
) (arg_uid_shift
+ arg_uid_range
))
1913 return RET_NERRNO(lchown(p
, uid
, gid
));
/* Creates the directory 'path' below 'root' with the given mode and hands it to the (shifted)
 * uid/gid via userns_lchown(). A directory that already exists is left alone, ownership included,
 * and reported as success. Returns 0 on success, negative errno otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r < 0)
                return r == -EEXIST ? 0 : r;

        return userns_lchown(full, uid, gid);
}
/* Matches 'path' against the two spellings of the zoneinfo directory that /etc/localtime symlinks
 * typically use (relative and absolute) via PATH_STARTSWITH_SET(). Callers treat a NULL result as
 * "does not point into /usr/share/zoneinfo/" and a non-NULL one as the timezone name. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(
                        path,
                        "../usr/share/zoneinfo/",
                        "/usr/share/zoneinfo/");

        return zone;
}
1937 static bool etc_writable(void) {
1938 return !arg_read_only
|| IN_SET(arg_volatile_mode
, VOLATILE_YES
, VOLATILE_OVERLAY
);
1941 static int setup_timezone(const char *dest
) {
1942 _cleanup_free_
char *p
= NULL
, *etc
= NULL
;
1943 const char *where
, *check
;
1949 if (IN_SET(arg_timezone
, TIMEZONE_AUTO
, TIMEZONE_SYMLINK
)) {
1950 r
= readlink_malloc("/etc/localtime", &p
);
1951 if (r
== -ENOENT
&& arg_timezone
== TIMEZONE_AUTO
)
1952 m
= etc_writable() ? TIMEZONE_DELETE
: TIMEZONE_OFF
;
1953 else if (r
== -EINVAL
&& arg_timezone
== TIMEZONE_AUTO
) /* regular file? */
1954 m
= etc_writable() ? TIMEZONE_COPY
: TIMEZONE_BIND
;
1956 log_warning_errno(r
, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
1957 /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
1961 * ln -s /usr/share/zoneinfo/UTC /etc/localtime
1964 } else if (arg_timezone
== TIMEZONE_AUTO
)
1965 m
= etc_writable() ? TIMEZONE_SYMLINK
: TIMEZONE_BIND
;
1971 if (m
== TIMEZONE_OFF
)
1974 r
= chase("/etc", dest
, CHASE_PREFIX_ROOT
, &etc
, NULL
);
1976 log_warning_errno(r
, "Failed to resolve /etc path in container, ignoring: %m");
1980 where
= strjoina(etc
, "/localtime");
1984 case TIMEZONE_DELETE
:
1985 if (unlink(where
) < 0)
1986 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
, "Failed to remove '%s', ignoring: %m", where
);
1990 case TIMEZONE_SYMLINK
: {
1991 _cleanup_free_
char *q
= NULL
;
1992 const char *z
, *what
;
1994 z
= timezone_from_path(p
);
1996 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
2000 r
= readlink_malloc(where
, &q
);
2001 if (r
>= 0 && streq_ptr(timezone_from_path(q
), z
))
2002 return 0; /* Already pointing to the right place? Then do nothing .. */
2004 check
= strjoina(dest
, "/usr/share/zoneinfo/", z
);
2005 r
= chase(check
, dest
, 0, NULL
, NULL
);
2007 log_debug_errno(r
, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z
);
2009 if (unlink(where
) < 0 && errno
!= ENOENT
) {
2010 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
, /* Don't complain on read-only images */
2011 errno
, "Failed to remove existing timezone info %s in container, ignoring: %m", where
);
2015 what
= strjoina("../usr/share/zoneinfo/", z
);
2016 if (symlink(what
, where
) < 0) {
2017 log_full_errno(IN_SET(errno
, EROFS
, EACCES
, EPERM
) ? LOG_DEBUG
: LOG_WARNING
,
2018 errno
, "Failed to correct timezone of container, ignoring: %m");
2028 case TIMEZONE_BIND
: {
2029 _cleanup_free_
char *resolved
= NULL
;
2032 found
= chase(where
, dest
, CHASE_NONEXISTENT
, &resolved
, NULL
);
2034 log_warning_errno(found
, "Failed to resolve /etc/localtime path in container, ignoring: %m");
2038 if (found
== 0) /* missing? */
2039 (void) touch(resolved
);
2041 r
= mount_nofollow_verbose(LOG_WARNING
, "/etc/localtime", resolved
, NULL
, MS_BIND
, NULL
);
2043 return mount_nofollow_verbose(LOG_ERR
, NULL
, resolved
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
2049 /* If mounting failed, try to copy */
2050 r
= copy_file_atomic("/etc/localtime", where
, 0644, COPY_REFLINK
|COPY_REPLACE
);
2052 log_full_errno(IN_SET(r
, -EROFS
, -EACCES
, -EPERM
) ? LOG_DEBUG
: LOG_WARNING
, r
,
2053 "Failed to copy /etc/localtime to %s, ignoring: %m", where
);
2060 assert_not_reached();
2063 /* Fix permissions of the symlink or file copy we just created */
2064 r
= userns_lchown(where
, 0, 0);
2066 log_warning_errno(r
, "Failed to chown /etc/localtime, ignoring: %m");
/* Checks via access(F_OK) whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT),
 * and a negative errno (logged at debug level) for any other failure. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2084 static int resolved_listening(void) {
2085 _cleanup_(sd_bus_error_free
) sd_bus_error error
= SD_BUS_ERROR_NULL
;
2086 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*bus
= NULL
;
2087 _cleanup_free_
char *dns_stub_listener_mode
= NULL
;
2090 /* Check if resolved is listening */
2092 r
= sd_bus_open_system(&bus
);
2094 return log_debug_errno(r
, "Failed to open system bus: %m");
2096 r
= bus_name_has_owner(bus
, "org.freedesktop.resolve1", NULL
);
2098 return log_debug_errno(r
, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2102 r
= bus_get_property_string(bus
, bus_resolve_mgr
, "DNSStubListener", &error
, &dns_stub_listener_mode
);
2104 return log_debug_errno(r
, "Failed to query DNSStubListener property: %s", bus_error_message(&error
, r
));
2106 return STR_IN_SET(dns_stub_listener_mode
, "udp", "yes");
2109 static int setup_resolv_conf(const char *dest
) {
2110 _cleanup_free_
char *etc
= NULL
;
2111 const char *where
, *what
;
2117 if (arg_resolv_conf
== RESOLV_CONF_AUTO
) {
2118 if (arg_private_network
)
2119 m
= RESOLV_CONF_OFF
;
2120 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF
) > 0 && resolved_listening() > 0)
2121 m
= etc_writable() ? RESOLV_CONF_COPY_STUB
: RESOLV_CONF_BIND_STUB
;
2122 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2123 m
= etc_writable() ? RESOLV_CONF_COPY_HOST
: RESOLV_CONF_BIND_HOST
;
2125 m
= etc_writable() ? RESOLV_CONF_DELETE
: RESOLV_CONF_OFF
;
2128 m
= arg_resolv_conf
;
2130 if (m
== RESOLV_CONF_OFF
)
2133 r
= chase("/etc", dest
, CHASE_PREFIX_ROOT
, &etc
, NULL
);
2135 log_warning_errno(r
, "Failed to resolve /etc path in container, ignoring: %m");
2139 where
= strjoina(etc
, "/resolv.conf");
2141 if (m
== RESOLV_CONF_DELETE
) {
2142 if (unlink(where
) < 0)
2143 log_full_errno(errno
== ENOENT
? LOG_DEBUG
: LOG_WARNING
, errno
, "Failed to remove '%s', ignoring: %m", where
);
2148 if (IN_SET(m
, RESOLV_CONF_BIND_STATIC
, RESOLV_CONF_REPLACE_STATIC
, RESOLV_CONF_COPY_STATIC
))
2149 what
= PRIVATE_STATIC_RESOLV_CONF
;
2150 else if (IN_SET(m
, RESOLV_CONF_BIND_UPLINK
, RESOLV_CONF_REPLACE_UPLINK
, RESOLV_CONF_COPY_UPLINK
))
2151 what
= PRIVATE_UPLINK_RESOLV_CONF
;
2152 else if (IN_SET(m
, RESOLV_CONF_BIND_STUB
, RESOLV_CONF_REPLACE_STUB
, RESOLV_CONF_COPY_STUB
))
2153 what
= PRIVATE_STUB_RESOLV_CONF
;
2155 what
= "/etc/resolv.conf";
2157 if (IN_SET(m
, RESOLV_CONF_BIND_HOST
, RESOLV_CONF_BIND_STATIC
, RESOLV_CONF_BIND_UPLINK
, RESOLV_CONF_BIND_STUB
)) {
2158 _cleanup_free_
char *resolved
= NULL
;
2161 found
= chase(where
, dest
, CHASE_NONEXISTENT
|CHASE_NOFOLLOW
, &resolved
, NULL
);
2163 log_warning_errno(found
, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2167 if (found
== 0) /* missing? */
2168 (void) touch(resolved
);
2170 r
= mount_nofollow_verbose(LOG_WARNING
, what
, resolved
, NULL
, MS_BIND
, NULL
);
2172 return mount_nofollow_verbose(LOG_ERR
, NULL
, resolved
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NODEV
, NULL
);
2174 /* If that didn't work, let's copy the file */
2177 if (IN_SET(m
, RESOLV_CONF_REPLACE_HOST
, RESOLV_CONF_REPLACE_STATIC
, RESOLV_CONF_REPLACE_UPLINK
, RESOLV_CONF_REPLACE_STUB
))
2178 r
= copy_file_atomic(what
, where
, 0644, COPY_REFLINK
|COPY_REPLACE
);
2180 r
= copy_file(what
, where
, O_TRUNC
|O_NOFOLLOW
, 0644, COPY_REFLINK
);
2182 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2183 * resolved or something similar runs inside and the symlink points there.
2185 * If the disk image is read-only, there's also no point in complaining.
2187 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST
, RESOLV_CONF_COPY_STATIC
, RESOLV_CONF_COPY_UPLINK
, RESOLV_CONF_COPY_STUB
) &&
2188 IN_SET(r
, -ELOOP
, -EROFS
, -EACCES
, -EPERM
) ? LOG_DEBUG
: LOG_WARNING
, r
,
2189 "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where
);
2193 r
= userns_lchown(where
, 0, 0);
2195 log_warning_errno(r
, "Failed to chown /etc/resolv.conf, ignoring: %m");
2200 static int setup_boot_id(void) {
2201 _cleanup_(unlink_and_freep
) char *from
= NULL
;
2202 _cleanup_free_
char *path
= NULL
;
2203 sd_id128_t rnd
= SD_ID128_NULL
;
2207 /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2209 r
= tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path
);
2211 return log_error_errno(r
, "Failed to generate random boot ID path: %m");
2213 r
= sd_id128_randomize(&rnd
);
2215 return log_error_errno(r
, "Failed to generate random boot id: %m");
2217 r
= id128_write(path
, ID128_FORMAT_UUID
, rnd
);
2219 return log_error_errno(r
, "Failed to write boot id: %m");
2221 from
= TAKE_PTR(path
);
2222 to
= "/proc/sys/kernel/random/boot_id";
2224 r
= mount_nofollow_verbose(LOG_ERR
, from
, to
, NULL
, MS_BIND
, NULL
);
2228 return mount_nofollow_verbose(LOG_ERR
, NULL
, to
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
2231 static int copy_devnodes(const char *dest
) {
2232 static const char devnodes
[] =
2245 BLOCK_WITH_UMASK(0000);
2247 /* Create /dev/net, so that we can create /dev/net/tun in it */
2248 if (userns_mkdir(dest
, "/dev/net", 0755, 0, 0) < 0)
2249 return log_error_errno(r
, "Failed to create /dev/net directory: %m");
2251 NULSTR_FOREACH(d
, devnodes
) {
2252 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
2255 from
= path_join("/dev/", d
);
2259 to
= path_join(dest
, from
);
2263 if (stat(from
, &st
) < 0) {
2265 if (errno
!= ENOENT
)
2266 return log_error_errno(errno
, "Failed to stat %s: %m", from
);
2268 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
))
2269 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2270 "%s is not a char or block device, cannot copy.", from
);
2272 _cleanup_free_
char *sl
= NULL
, *prefixed
= NULL
, *dn
= NULL
, *t
= NULL
;
2274 if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
2275 /* Explicitly warn the user when /dev is already populated. */
2276 if (errno
== EEXIST
)
2277 log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest
);
2279 return log_error_errno(errno
, "mknod(%s) failed: %m", to
);
2281 /* Some systems abusively restrict mknod but allow bind mounts. */
2284 return log_error_errno(r
, "touch (%s) failed: %m", to
);
2285 r
= mount_nofollow_verbose(LOG_DEBUG
, from
, to
, NULL
, MS_BIND
, NULL
);
2287 return log_error_errno(r
, "Both mknod and bind mount (%s) failed: %m", to
);
2290 r
= userns_lchown(to
, 0, 0);
2292 return log_error_errno(r
, "chown() of device node %s failed: %m", to
);
2294 dn
= path_join("/dev", S_ISCHR(st
.st_mode
) ? "char" : "block");
2298 r
= userns_mkdir(dest
, dn
, 0755, 0, 0);
2300 return log_error_errno(r
, "Failed to create '%s': %m", dn
);
2302 if (asprintf(&sl
, "%s/%u:%u", dn
, major(st
.st_rdev
), minor(st
.st_rdev
)) < 0)
2305 prefixed
= path_join(dest
, sl
);
2309 t
= path_join("..", d
);
2313 if (symlink(t
, prefixed
) < 0)
2314 log_debug_errno(errno
, "Failed to symlink '%s' to '%s': %m", t
, prefixed
);
2321 static int make_extra_nodes(const char *dest
) {
2325 BLOCK_WITH_UMASK(0000);
2327 for (i
= 0; i
< arg_n_extra_nodes
; i
++) {
2328 _cleanup_free_
char *path
= NULL
;
2329 DeviceNode
*n
= arg_extra_nodes
+ i
;
2331 path
= path_join(dest
, n
->path
);
2335 if (mknod(path
, n
->mode
, S_ISCHR(n
->mode
) || S_ISBLK(n
->mode
) ? makedev(n
->major
, n
->minor
) : 0) < 0)
2336 return log_error_errno(errno
, "Failed to create device node '%s': %m", path
);
2338 r
= chmod_and_chown(path
, n
->mode
, n
->uid
, n
->gid
);
2340 return log_error_errno(r
, "Failed to adjust device node ownership of '%s': %m", path
);
2346 static int setup_pts(const char *dest
) {
2347 _cleanup_free_
char *options
= NULL
;
2352 if (arg_selinux_apifs_context
)
2353 (void) asprintf(&options
,
2354 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
",context=\"%s\"",
2355 arg_uid_shift
+ TTY_GID
,
2356 arg_selinux_apifs_context
);
2359 (void) asprintf(&options
,
2360 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT
,
2361 arg_uid_shift
+ TTY_GID
);
2366 /* Mount /dev/pts itself */
2367 p
= prefix_roota(dest
, "/dev/pts");
2368 r
= RET_NERRNO(mkdir(p
, 0755));
2370 return log_error_errno(r
, "Failed to create /dev/pts: %m");
2372 r
= mount_nofollow_verbose(LOG_ERR
, "devpts", p
, "devpts", MS_NOSUID
|MS_NOEXEC
, options
);
2375 r
= userns_lchown(p
, 0, 0);
2377 return log_error_errno(r
, "Failed to chown /dev/pts: %m");
2379 /* Create /dev/ptmx symlink */
2380 p
= prefix_roota(dest
, "/dev/ptmx");
2381 if (symlink("pts/ptmx", p
) < 0)
2382 return log_error_errno(errno
, "Failed to create /dev/ptmx symlink: %m");
2383 r
= userns_lchown(p
, 0, 0);
2385 return log_error_errno(r
, "Failed to chown /dev/ptmx: %m");
2387 /* And fix /dev/pts/ptmx ownership */
2388 p
= prefix_roota(dest
, "/dev/pts/ptmx");
2389 r
= userns_lchown(p
, 0, 0);
2391 return log_error_errno(r
, "Failed to chown /dev/pts/ptmx: %m");
2396 static int setup_stdio_as_dev_console(void) {
2397 _cleanup_close_
int terminal
= -EBADF
;
2400 /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2401 * explicitly, if we are configured to. */
2402 terminal
= open_terminal("/dev/console", O_RDWR
|O_NOCTTY
);
2404 return log_error_errno(terminal
, "Failed to open console: %m");
2406 /* Make sure we can continue logging to the original stderr, even if
2407 * stderr points elsewhere now */
2408 r
= log_dup_console();
2410 return log_error_errno(r
, "Failed to duplicate stderr: %m");
2412 /* invalidates 'terminal' on success and failure */
2413 r
= rearrange_stdio(terminal
, terminal
, terminal
);
2416 return log_error_errno(r
, "Failed to move console to stdin/stdout/stderr: %m");
2421 static int setup_dev_console(const char *console
) {
2422 _cleanup_free_
char *p
= NULL
;
2425 /* Create /dev/console symlink */
2426 r
= path_make_relative("/dev", console
, &p
);
2428 return log_error_errno(r
, "Failed to create relative path: %m");
2430 if (symlink(p
, "/dev/console") < 0)
2431 return log_error_errno(errno
, "Failed to create /dev/console symlink: %m");
2436 static int setup_keyring(void) {
2437 key_serial_t keyring
;
2439 /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2440 * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2441 * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2442 * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2443 * into the container. */
2445 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, 0, 0, 0, 0);
2446 if (keyring
== -1) {
2447 if (errno
== ENOSYS
)
2448 log_debug_errno(errno
, "Kernel keyring not supported, ignoring.");
2449 else if (ERRNO_IS_PRIVILEGE(errno
))
2450 log_debug_errno(errno
, "Kernel keyring access prohibited, ignoring.");
2452 return log_error_errno(errno
, "Setting up kernel keyring failed: %m");
2458 static int setup_credentials(const char *root
) {
2462 if (arg_n_credentials
<= 0)
2465 r
= userns_mkdir(root
, "/run/host", 0755, 0, 0);
2467 return log_error_errno(r
, "Failed to create /run/host: %m");
2469 r
= userns_mkdir(root
, "/run/host/credentials", 0700, 0, 0);
2471 return log_error_errno(r
, "Failed to create /run/host/credentials: %m");
2473 q
= prefix_roota(root
, "/run/host/credentials");
2474 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, "ramfs", MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "mode=0700");
2478 for (size_t i
= 0; i
< arg_n_credentials
; i
++) {
2479 _cleanup_free_
char *j
= NULL
;
2480 _cleanup_close_
int fd
= -EBADF
;
2482 j
= path_join(q
, arg_credentials
[i
].id
);
2486 fd
= open(j
, O_CREAT
|O_EXCL
|O_WRONLY
|O_CLOEXEC
|O_NOFOLLOW
, 0600);
2488 return log_error_errno(errno
, "Failed to create credential file %s: %m", j
);
2490 r
= loop_write(fd
, arg_credentials
[i
].data
, arg_credentials
[i
].size
);
2492 return log_error_errno(r
, "Failed to write credential to file %s: %m", j
);
2494 if (fchmod(fd
, 0400) < 0)
2495 return log_error_errno(errno
, "Failed to adjust access mode of %s: %m", j
);
2497 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
2498 if (fchown(fd
, arg_uid_shift
, arg_uid_shift
) < 0)
2499 return log_error_errno(errno
, "Failed to adjust ownership of %s: %m", j
);
2503 if (chmod(q
, 0500) < 0)
2504 return log_error_errno(errno
, "Failed to adjust access mode of %s: %m", q
);
2506 r
= userns_lchown(q
, 0, 0);
2510 /* Make both mount and superblock read-only now */
2511 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, NULL
);
2515 return mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_REMOUNT
|MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, "mode=0500");
2518 static int setup_kmsg(int fd_inner_socket
) {
2519 _cleanup_(unlink_and_freep
) char *from
= NULL
;
2520 _cleanup_free_
char *fifo
= NULL
;
2521 _cleanup_close_
int fd
= -EBADF
;
2524 assert(fd_inner_socket
>= 0);
2526 BLOCK_WITH_UMASK(0000);
2528 /* We create the kmsg FIFO as a temporary file in /run, but immediately delete it after bind mounting it to
2529 * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2530 * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2531 * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2533 r
= tempfn_random_child("/run", "proc-kmsg", &fifo
);
2535 return log_error_errno(r
, "Failed to generate kmsg path: %m");
2537 if (mkfifo(fifo
, 0600) < 0)
2538 return log_error_errno(errno
, "mkfifo() for /run/kmsg failed: %m");
2540 from
= TAKE_PTR(fifo
);
2542 r
= mount_nofollow_verbose(LOG_ERR
, from
, "/proc/kmsg", NULL
, MS_BIND
, NULL
);
2546 fd
= open(from
, O_RDWR
|O_NONBLOCK
|O_CLOEXEC
);
2548 return log_error_errno(errno
, "Failed to open fifo: %m");
2550 /* Store away the fd in the socket, so that it stays open as long as we run the child */
2551 r
= send_one_fd(fd_inner_socket
, fd
, 0);
2553 return log_error_errno(r
, "Failed to send FIFO fd: %m");
2559 union in_addr_union address4
;
2560 union in_addr_union address6
;
2561 struct FirewallContext
*fw_ctx
;
2564 static int on_address_change(sd_netlink
*rtnl
, sd_netlink_message
*m
, void *userdata
) {
2565 struct ExposeArgs
*args
= ASSERT_PTR(userdata
);
2570 (void) expose_port_execute(rtnl
, &args
->fw_ctx
, arg_expose_ports
, AF_INET
, &args
->address4
);
2571 (void) expose_port_execute(rtnl
, &args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &args
->address6
);
2575 static int setup_hostname(void) {
2578 if ((arg_clone_ns_flags
& CLONE_NEWUTS
) == 0)
2581 r
= sethostname_idempotent(arg_hostname
?: arg_machine
);
2583 return log_error_errno(r
, "Failed to set hostname: %m");
2588 static int setup_journal(const char *directory
) {
2589 _cleanup_free_
char *d
= NULL
;
2595 /* Don't link journals in ephemeral mode */
2599 if (arg_link_journal
== LINK_NO
)
2602 try = arg_link_journal_try
|| arg_link_journal
== LINK_AUTO
;
2604 r
= sd_id128_get_machine(&this_id
);
2606 return log_error_errno(r
, "Failed to retrieve machine ID: %m");
2608 if (sd_id128_equal(arg_uuid
, this_id
)) {
2609 log_full(try ? LOG_WARNING
: LOG_ERR
,
2610 "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid
));
2616 FOREACH_STRING(dirname
, "/var", "/var/log", "/var/log/journal") {
2617 r
= userns_mkdir(directory
, dirname
, 0755, 0, 0);
2619 bool ignore
= r
== -EROFS
&& try;
2620 log_full_errno(ignore
? LOG_DEBUG
: LOG_ERR
, r
,
2621 "Failed to create %s%s: %m", dirname
, ignore
? ", ignoring" : "");
2622 return ignore
? 0 : r
;
2626 p
= strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid
));
2627 q
= prefix_roota(directory
, p
);
2629 if (path_is_mount_point(p
, NULL
, 0) > 0) {
2633 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
),
2634 "%s: already a mount point, refusing to use for journal", p
);
2637 if (path_is_mount_point(q
, NULL
, 0) > 0) {
2641 return log_error_errno(SYNTHETIC_ERRNO(EEXIST
),
2642 "%s: already a mount point, refusing to use for journal", q
);
2645 r
= readlink_and_make_absolute(p
, &d
);
2647 if (IN_SET(arg_link_journal
, LINK_GUEST
, LINK_AUTO
) &&
2650 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2652 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
2657 return log_error_errno(errno
, "Failed to remove symlink %s: %m", p
);
2658 } else if (r
== -EINVAL
) {
2660 if (arg_link_journal
== LINK_GUEST
&&
2663 if (errno
== ENOTDIR
) {
2664 log_error("%s already exists and is neither a symlink nor a directory", p
);
2667 return log_error_errno(errno
, "Failed to remove %s: %m", p
);
2669 } else if (r
!= -ENOENT
)
2670 return log_error_errno(r
, "readlink(%s) failed: %m", p
);
2672 if (arg_link_journal
== LINK_GUEST
) {
2674 if (symlink(q
, p
) < 0) {
2676 log_debug_errno(errno
, "Failed to symlink %s to %s, skipping journal setup: %m", q
, p
);
2679 return log_error_errno(errno
, "Failed to symlink %s to %s: %m", q
, p
);
2682 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2684 log_warning_errno(r
, "Failed to create directory %s: %m", q
);
2688 if (arg_link_journal
== LINK_HOST
) {
2689 /* don't create parents here — if the host doesn't have
2690 * permanent journal set up, don't force it here */
2692 r
= RET_NERRNO(mkdir(p
, 0755));
2693 if (r
< 0 && r
!= -EEXIST
) {
2695 log_debug_errno(r
, "Failed to create %s, skipping journal setup: %m", p
);
2698 return log_error_errno(r
, "Failed to create %s: %m", p
);
2701 } else if (access(p
, F_OK
) < 0)
2704 if (dir_is_empty(q
, /* ignore_hidden_or_backup= */ false) == 0)
2705 log_warning("%s is not empty, proceeding anyway.", q
);
2707 r
= userns_mkdir(directory
, p
, 0755, 0, 0);
2709 return log_error_errno(r
, "Failed to create %s: %m", q
);
2711 r
= mount_nofollow_verbose(LOG_DEBUG
, p
, q
, NULL
, MS_BIND
, NULL
);
2713 return log_error_errno(errno
, "Failed to bind mount journal from host into guest: %m");
2718 static int drop_capabilities(uid_t uid
) {
2719 CapabilityQuintet q
;
2721 /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2722 * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2723 * arg_caps_retain. */
2725 if (capability_quintet_is_set(&arg_full_capabilities
)) {
2726 q
= arg_full_capabilities
;
2728 if (q
.bounding
== UINT64_MAX
)
2729 q
.bounding
= uid
== 0 ? arg_caps_retain
: 0;
2731 if (q
.effective
== UINT64_MAX
)
2732 q
.effective
= uid
== 0 ? q
.bounding
: 0;
2734 if (q
.inheritable
== UINT64_MAX
)
2735 q
.inheritable
= uid
== 0 ? q
.bounding
: arg_caps_ambient
;
2737 if (q
.permitted
== UINT64_MAX
)
2738 q
.permitted
= uid
== 0 ? q
.bounding
: arg_caps_ambient
;
2740 if (q
.ambient
== UINT64_MAX
&& ambient_capabilities_supported())
2741 q
.ambient
= arg_caps_ambient
;
2743 if (capability_quintet_mangle(&q
))
2744 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Cannot set capabilities that are not in the current bounding set.");
2747 q
= (CapabilityQuintet
) {
2748 .bounding
= arg_caps_retain
,
2749 .effective
= uid
== 0 ? arg_caps_retain
: 0,
2750 .inheritable
= uid
== 0 ? arg_caps_retain
: arg_caps_ambient
,
2751 .permitted
= uid
== 0 ? arg_caps_retain
: arg_caps_ambient
,
2752 .ambient
= ambient_capabilities_supported() ? arg_caps_ambient
: UINT64_MAX
,
2755 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2756 * in order to maintain the same behavior as systemd < 242. */
2757 if (capability_quintet_mangle(&q
))
2758 log_full(arg_quiet
? LOG_DEBUG
: LOG_WARNING
,
2759 "Some capabilities will not be set because they are not in the current bounding set.");
2763 return capability_quintet_enforce(&q
);
2766 static int reset_audit_loginuid(void) {
2767 _cleanup_free_
char *p
= NULL
;
2770 if ((arg_clone_ns_flags
& CLONE_NEWPID
) == 0)
2773 r
= read_one_line_file("/proc/self/loginuid", &p
);
2777 return log_error_errno(r
, "Failed to read /proc/self/loginuid: %m");
2779 /* Already reset? */
2780 if (streq(p
, "4294967295"))
2783 r
= write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER
);
2786 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2787 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2788 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2789 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2790 "using systemd-nspawn. Sleeping for 5s... (%m)");
2798 static int mount_tunnel_dig(const char *root
) {
2802 (void) mkdir_p("/run/systemd/nspawn/", 0755);
2803 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2804 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
2805 (void) mkdir_p(p
, 0600);
2807 r
= userns_mkdir(root
, "/run/host", 0755, 0, 0);
2809 return log_error_errno(r
, "Failed to create /run/host: %m");
2811 r
= userns_mkdir(root
, NSPAWN_MOUNT_TUNNEL
, 0600, 0, 0);
2813 return log_error_errno(r
, "Failed to create "NSPAWN_MOUNT_TUNNEL
": %m");
2815 q
= prefix_roota(root
, NSPAWN_MOUNT_TUNNEL
);
2816 r
= mount_nofollow_verbose(LOG_ERR
, p
, q
, NULL
, MS_BIND
, NULL
);
2820 r
= mount_nofollow_verbose(LOG_ERR
, NULL
, q
, NULL
, MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
);
2827 static int mount_tunnel_open(void) {
2830 r
= mount_follow_verbose(LOG_ERR
, NULL
, NSPAWN_MOUNT_TUNNEL
, NULL
, MS_SLAVE
, NULL
);
2837 static int setup_machine_id(const char *directory
) {
2840 /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2841 * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2842 * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2843 * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2844 * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2845 * container behaves nicely). */
2847 r
= id128_get_machine(directory
, &arg_uuid
);
2848 if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r
)) {
2849 /* If the file is missing, empty, or uninitialized, we don't mind */
2850 if (sd_id128_is_null(arg_uuid
)) {
2851 r
= sd_id128_randomize(&arg_uuid
);
2853 return log_error_errno(r
, "Failed to acquire randomized machine UUID: %m");
2856 return log_error_errno(r
, "Failed to read machine ID from container image: %m");
2861 static int recursive_chown(const char *directory
, uid_t shift
, uid_t range
) {
2866 if (arg_userns_mode
== USER_NAMESPACE_NO
|| arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_CHOWN
)
2869 r
= path_patch_uid(directory
, arg_uid_shift
, arg_uid_range
);
2870 if (r
== -EOPNOTSUPP
)
2871 return log_error_errno(r
, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2873 return log_error_errno(r
, "Upper 16 bits of root directory UID and GID do not match.");
2875 return log_error_errno(r
, "Failed to adjust UID/GID shift of OS tree: %m");
2877 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2879 log_debug("Patched directory tree to match UID/GID range.");
2886 * < 0 : wait_for_terminate() failed to get the state of the
2887 * container, the container was terminated by a signal, or
2888 * failed for an unknown reason. No change is made to the
2889 * container argument.
2890 * > 0 : The program executed in the container terminated with an
2891 * error. The exit code of the program executed in the
2892 * container is returned. The container argument has been set
2893 * to CONTAINER_TERMINATED.
2894 * 0 : The container is being rebooted, has been shut down or exited
2895 * successfully. The container argument has been set to either
2896 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2898 * That is, success is indicated by a return value of zero, and an
2899 * error is indicated by a non-zero value.
2901 static int wait_for_container(pid_t pid
, ContainerStatus
*container
) {
2905 r
= wait_for_terminate(pid
, &status
);
2907 return log_warning_errno(r
, "Failed to wait for container: %m");
2909 switch (status
.si_code
) {
2912 if (status
.si_status
== 0)
2913 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s exited successfully.", arg_machine
);
2915 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s failed with error code %i.", arg_machine
, status
.si_status
);
2917 *container
= CONTAINER_TERMINATED
;
2918 return status
.si_status
;
2921 if (status
.si_status
== SIGINT
) {
2922 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s has been shut down.", arg_machine
);
2923 *container
= CONTAINER_TERMINATED
;
2926 } else if (status
.si_status
== SIGHUP
) {
2927 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
, "Container %s is being rebooted.", arg_machine
);
2928 *container
= CONTAINER_REBOOTED
;
2934 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2935 "Container %s terminated by signal %s.", arg_machine
, signal_to_string(status
.si_status
));
2938 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
2939 "Container %s failed due to unknown reason.", arg_machine
);
2943 static int on_orderly_shutdown(sd_event_source
*s
, const struct signalfd_siginfo
*si
, void *userdata
) {
2946 pid
= PTR_TO_PID(userdata
);
2948 if (kill(pid
, arg_kill_signal
) >= 0) {
2949 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2950 sd_event_source_set_userdata(s
, NULL
);
2955 sd_event_exit(sd_event_source_get_event(s
), 0);
2959 static int on_sigchld(sd_event_source
*s
, const struct signalfd_siginfo
*ssi
, void *userdata
) {
2965 pid
= PTR_TO_PID(userdata
);
2970 if (waitid(P_ALL
, 0, &si
, WNOHANG
|WNOWAIT
|WEXITED
) < 0)
2971 return log_error_errno(errno
, "Failed to waitid(): %m");
2972 if (si
.si_pid
== 0) /* No pending children. */
2974 if (si
.si_pid
== pid
) {
2975 /* The main process we care for has exited. Return from
2976 * signal handler but leave the zombie. */
2977 sd_event_exit(sd_event_source_get_event(s
), 0);
2981 /* Reap all other children. */
2982 (void) waitid(P_PID
, si
.si_pid
, &si
, WNOHANG
|WEXITED
);
2988 static int on_request_stop(sd_bus_message
*m
, void *userdata
, sd_bus_error
*error
) {
2993 pid
= PTR_TO_PID(userdata
);
2995 if (arg_kill_signal
> 0) {
2996 log_info("Container termination requested. Attempting to halt container.");
2997 (void) kill(pid
, arg_kill_signal
);
2999 log_info("Container termination requested. Exiting.");
3000 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m
)), 0);
3006 static int determine_names(void) {
3009 if (arg_template
&& !arg_directory
&& arg_machine
) {
3011 /* If --template= was specified then we should not
3012 * search for a machine, but instead create a new one
3013 * in /var/lib/machines. */
3015 arg_directory
= path_join("/var/lib/machines", arg_machine
);
3020 if (!arg_image
&& !arg_directory
) {
3022 _cleanup_(image_unrefp
) Image
*i
= NULL
;
3024 r
= image_find(IMAGE_MACHINE
, arg_machine
, NULL
, &i
);
3026 return log_error_errno(r
, "No image for machine '%s'.", arg_machine
);
3028 return log_error_errno(r
, "Failed to find image for machine '%s': %m", arg_machine
);
3030 if (IN_SET(i
->type
, IMAGE_RAW
, IMAGE_BLOCK
))
3031 r
= free_and_strdup(&arg_image
, i
->path
);
3033 r
= free_and_strdup(&arg_directory
, i
->path
);
3038 arg_read_only
= arg_read_only
|| i
->read_only
;
3040 r
= safe_getcwd(&arg_directory
);
3042 return log_error_errno(r
, "Failed to determine current directory: %m");
3045 if (!arg_directory
&& !arg_image
)
3046 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Failed to determine path, please use -D or -i.");
3050 if (arg_directory
&& path_equal(arg_directory
, "/"))
3051 arg_machine
= gethostname_malloc();
3052 else if (arg_image
) {
3055 r
= path_extract_filename(arg_image
, &arg_machine
);
3057 return log_error_errno(r
, "Failed to extract file name from '%s': %m", arg_image
);
3059 /* Truncate suffix if there is one */
3060 e
= endswith(arg_machine
, ".raw");
3064 r
= path_extract_filename(arg_directory
, &arg_machine
);
3066 return log_error_errno(r
, "Failed to extract file name from '%s': %m", arg_directory
);
3069 hostname_cleanup(arg_machine
);
3070 if (!hostname_is_valid(arg_machine
, 0))
3071 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "Failed to determine machine name automatically, please use -M.");
3073 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3074 * to match fixed config file names. */
3075 arg_settings_filename
= strjoin(arg_machine
, ".nspawn");
3076 if (!arg_settings_filename
)
3079 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3080 * instances at once without manually having to specify -M each time. */
3082 if (strextendf(&arg_machine
, "-%016" PRIx64
, random_u64()) < 0)
3085 arg_settings_filename
= strjoin(arg_machine
, ".nspawn");
3086 if (!arg_settings_filename
)
3093 static int chase_and_update(char **p
, unsigned flags
) {
3102 r
= chase(*p
, NULL
, flags
, &chased
, NULL
);
3104 return log_error_errno(r
, "Failed to resolve path %s: %m", *p
);
3106 return free_and_replace(*p
, chased
);
3109 static int determine_uid_shift(const char *directory
) {
3111 if (arg_userns_mode
== USER_NAMESPACE_NO
) {
3116 if (arg_uid_shift
== UID_INVALID
) {
3119 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3121 if (stat(directory
, &st
) < 0)
3122 return log_error_errno(errno
, "Failed to determine UID base of %s: %m", directory
);
3124 arg_uid_shift
= st
.st_uid
& UINT32_C(0xffff0000);
3126 if (arg_uid_shift
!= (st
.st_gid
& UINT32_C(0xffff0000)))
3127 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
3128 "UID and GID base of %s don't match.", directory
);
3130 arg_uid_range
= UINT32_C(0x10000);
3132 if (arg_uid_shift
!= 0) {
3133 /* If the image is shifted already, then we'll fall back to classic chowning, for
3134 * compatibility (and simplicity), or refuse if mapping is explicitly requested. */
3136 if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_AUTO
) {
3137 log_debug("UID base of %s is non-zero, not using UID mapping.", directory
);
3138 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
3139 } else if (arg_userns_ownership
== USER_NAMESPACE_OWNERSHIP_MAP
)
3140 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
3141 "UID base of %s is not zero, UID mapping not supported.", directory
);
3145 if (!userns_shift_range_valid(arg_uid_shift
, arg_uid_range
))
3146 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
), "UID base too high for UID range.");
3151 static unsigned long effective_clone_ns_flags(void) {
3152 unsigned long flags
= arg_clone_ns_flags
;
3154 if (arg_private_network
)
3155 flags
|= CLONE_NEWNET
;
3157 flags
|= CLONE_NEWCGROUP
;
3158 if (arg_userns_mode
!= USER_NAMESPACE_NO
)
3159 flags
|= CLONE_NEWUSER
;
3164 static int patch_sysctl(void) {
3166 /* This table is inspired by runc's sysctl() function */
3167 static const struct {
3170 unsigned long clone_flags
;
3172 { "kernel.hostname", false, CLONE_NEWUTS
},
3173 { "kernel.domainname", false, CLONE_NEWUTS
},
3174 { "kernel.msgmax", false, CLONE_NEWIPC
},
3175 { "kernel.msgmnb", false, CLONE_NEWIPC
},
3176 { "kernel.msgmni", false, CLONE_NEWIPC
},
3177 { "kernel.sem", false, CLONE_NEWIPC
},
3178 { "kernel.shmall", false, CLONE_NEWIPC
},
3179 { "kernel.shmmax", false, CLONE_NEWIPC
},
3180 { "kernel.shmmni", false, CLONE_NEWIPC
},
3181 { "fs.mqueue.", true, CLONE_NEWIPC
},
3182 { "net.", true, CLONE_NEWNET
},
3185 unsigned long flags
;
3188 flags
= effective_clone_ns_flags();
3190 STRV_FOREACH_PAIR(k
, v
, arg_sysctl
) {
3194 for (i
= 0; i
< ELEMENTSOF(safe_sysctl
); i
++) {
3196 if (!FLAGS_SET(flags
, safe_sysctl
[i
].clone_flags
))
3199 if (safe_sysctl
[i
].prefix
)
3200 good
= startswith(*k
, safe_sysctl
[i
].key
);
3202 good
= streq(*k
, safe_sysctl
[i
].key
);
3209 return log_error_errno(SYNTHETIC_ERRNO(EPERM
), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k
);
3211 r
= sysctl_write(*k
, *v
);
3213 return log_error_errno(r
, "Failed to write sysctl '%s': %m", *k
);
3219 static int inner_child(
3221 int fd_inner_socket
,
3223 char **os_release_pairs
) {
3225 _cleanup_free_
char *home
= NULL
;
3228 (char*) "PATH=" DEFAULT_PATH_COMPAT
,
3229 NULL
, /* container */
3234 NULL
, /* container_uuid */
3235 NULL
, /* LISTEN_FDS */
3236 NULL
, /* LISTEN_PID */
3237 NULL
, /* NOTIFY_SOCKET */
3238 NULL
, /* CREDENTIALS_DIRECTORY */
3242 const char *exec_target
;
3243 _cleanup_strv_free_
char **env_use
= NULL
;
3244 int r
, which_failed
;
3246 /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3247 * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3248 * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3249 * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3250 * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3251 * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3254 * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3255 * unshare(). See below. */
3258 assert(fd_inner_socket
>= 0);
3260 log_debug("Inner child is initializing.");
3262 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3263 /* Tell the parent, that it now can write the UID map. */
3264 (void) barrier_place(barrier
); /* #1 */
3266 /* Wait until the parent wrote the UID map */
3267 if (!barrier_place_and_sync(barrier
)) /* #2 */
3268 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Parent died too early");
3270 /* Become the new root user inside our namespace */
3271 r
= reset_uid_gid();
3273 return log_error_errno(r
, "Couldn't become new root: %m");
3275 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3276 * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3277 * propagation, but simply create new peer groups for all our mounts). */
3278 r
= mount_follow_verbose(LOG_ERR
, NULL
, "/", NULL
, MS_SHARED
|MS_REC
, NULL
);
3284 arg_mount_settings
| MOUNT_IN_USERNS
,
3286 arg_selinux_apifs_context
);
3290 if (!arg_network_namespace_path
&& arg_private_network
) {
3291 r
= unshare(CLONE_NEWNET
);
3293 return log_error_errno(errno
, "Failed to unshare network namespace: %m");
3295 /* Tell the parent that it can setup network interfaces. */
3296 (void) barrier_place(barrier
); /* #3 */
3299 r
= mount_sysfs(NULL
, arg_mount_settings
);
3303 /* Wait until we are cgroup-ified, so that we
3304 * can mount the right cgroup path writable */
3305 if (!barrier_place_and_sync(barrier
)) /* #4 */
3306 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
),
3307 "Parent died too early");
3310 r
= unshare(CLONE_NEWCGROUP
);
3312 return log_error_errno(errno
, "Failed to unshare cgroup namespace: %m");
3315 arg_unified_cgroup_hierarchy
,
3316 arg_userns_mode
!= USER_NAMESPACE_NO
,
3319 arg_selinux_apifs_context
,
3322 r
= mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy
);
3326 r
= setup_boot_id();
3330 r
= setup_kmsg(fd_inner_socket
);
3337 arg_n_custom_mounts
,
3340 arg_selinux_apifs_context
,
3341 MOUNT_NON_ROOT_ONLY
| MOUNT_IN_USERNS
);
3346 return log_error_errno(errno
, "setsid() failed: %m");
3348 if (arg_private_network
)
3349 (void) loopback_setup();
3351 if (arg_expose_ports
) {
3352 r
= expose_port_send_rtnl(fd_inner_socket
);
3357 if (arg_console_mode
!= CONSOLE_PIPE
) {
3358 _cleanup_close_
int master
= -EBADF
;
3359 _cleanup_free_
char *console
= NULL
;
3361 /* Allocate a pty and make it available as /dev/console. */
3362 master
= openpt_allocate(O_RDWR
|O_NONBLOCK
, &console
);
3364 return log_error_errno(master
, "Failed to allocate a pty: %m");
3366 r
= setup_dev_console(console
);
3368 return log_error_errno(r
, "Failed to set up /dev/console: %m");
3370 r
= send_one_fd(fd_inner_socket
, master
, 0);
3372 return log_error_errno(r
, "Failed to send master fd: %m");
3374 r
= setup_stdio_as_dev_console();
3383 if (arg_oom_score_adjust_set
) {
3384 r
= set_oom_score_adjust(arg_oom_score_adjust
);
3386 return log_error_errno(r
, "Failed to adjust OOM score: %m");
3389 if (arg_cpu_set
.set
)
3390 if (sched_setaffinity(0, arg_cpu_set
.allocated
, arg_cpu_set
.set
) < 0)
3391 return log_error_errno(errno
, "Failed to set CPU affinity: %m");
3393 (void) setup_hostname();
3395 if (arg_personality
!= PERSONALITY_INVALID
) {
3396 r
= safe_personality(arg_personality
);
3398 return log_error_errno(r
, "personality() failed: %m");
3399 #ifdef ARCHITECTURE_SECONDARY
3400 } else if (arg_architecture
== ARCHITECTURE_SECONDARY
) {
3401 r
= safe_personality(PER_LINUX32
);
3403 return log_error_errno(r
, "personality() failed: %m");
3405 } else if (!arg_quiet
&& arg_architecture
>= 0 && arg_architecture
!= native_architecture())
3406 log_notice("Selected architecture '%s' not supported natively on the local CPU, assuming "
3407 "invocation with qemu userspace emulator (or equivalent) in effect.",
3408 architecture_to_string(arg_architecture
));
3410 r
= setrlimit_closest_all((const struct rlimit
*const*) arg_rlimit
, &which_failed
);
3412 return log_error_errno(r
, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed
));
3417 if (is_seccomp_available()) {
3418 r
= seccomp_load(arg_seccomp
);
3419 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
3420 return log_error_errno(r
, "Failed to install seccomp filter: %m");
3422 log_debug_errno(r
, "Failed to install seccomp filter: %m");
3427 r
= setup_seccomp(arg_caps_retain
, arg_syscall_allow_list
, arg_syscall_deny_list
);
3432 if (arg_suppress_sync
) {
3434 r
= seccomp_suppress_sync();
3436 log_debug_errno(r
, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3438 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3443 if (arg_selinux_context
)
3444 if (setexeccon(arg_selinux_context
) < 0)
3445 return log_error_errno(errno
, "setexeccon(\"%s\") failed: %m", arg_selinux_context
);
3448 /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3449 * if we need to later on. */
3450 if (prctl(PR_SET_KEEPCAPS
, 1) < 0)
3451 return log_error_errno(errno
, "Failed to set PR_SET_KEEPCAPS: %m");
3453 if (uid_is_valid(arg_uid
) || gid_is_valid(arg_gid
))
3454 r
= change_uid_gid_raw(arg_uid
, arg_gid
, arg_supplementary_gids
, arg_n_supplementary_gids
, arg_console_mode
!= CONSOLE_PIPE
);
3456 r
= change_uid_gid(arg_user
, arg_console_mode
!= CONSOLE_PIPE
, &home
);
3460 r
= drop_capabilities(getuid());
3462 return log_error_errno(r
, "Dropping capabilities failed: %m");
3464 if (arg_no_new_privileges
)
3465 if (prctl(PR_SET_NO_NEW_PRIVS
, 1, 0, 0, 0) < 0)
3466 return log_error_errno(errno
, "Failed to disable new privileges: %m");
3468 /* LXC sets container=lxc, so follow the scheme here */
3469 envp
[n_env
++] = strjoina("container=", arg_container_service_name
);
3471 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
3475 if (home
|| !uid_is_valid(arg_uid
) || arg_uid
== 0)
3476 if (asprintf(envp
+ n_env
++, "HOME=%s", home
?: "/root") < 0)
3479 if (arg_user
|| !uid_is_valid(arg_uid
) || arg_uid
== 0)
3480 if (asprintf(envp
+ n_env
++, "USER=%s", arg_user
?: "root") < 0 ||
3481 asprintf(envp
+ n_env
++, "LOGNAME=%s", arg_user
?: "root") < 0)
3484 assert(!sd_id128_is_null(arg_uuid
));
3486 if (asprintf(envp
+ n_env
++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid
)) < 0)
3489 if (fdset_size(fds
) > 0) {
3490 r
= fdset_cloexec(fds
, false);
3492 return log_error_errno(r
, "Failed to unset O_CLOEXEC for file descriptors.");
3494 if ((asprintf(envp
+ n_env
++, "LISTEN_FDS=%u", fdset_size(fds
)) < 0) ||
3495 (asprintf(envp
+ n_env
++, "LISTEN_PID=1") < 0))
3498 if (asprintf(envp
+ n_env
++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH
) < 0)
3501 if (arg_n_credentials
> 0) {
3502 envp
[n_env
] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3508 if (arg_start_mode
!= START_BOOT
) {
3509 envp
[n_env
] = strdup("LANG=" SYSTEMD_NSPAWN_LOCALE
);
3515 env_use
= strv_env_merge(envp
, os_release_pairs
, arg_setenv
);
3519 /* Let the parent know that we are ready and wait until the parent is ready with the setup, too... */
3520 if (!barrier_place_and_sync(barrier
)) /* #5 */
3521 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Parent died too early");
3524 if (chdir(arg_chdir
) < 0)
3525 return log_error_errno(errno
, "Failed to change to specified working directory %s: %m", arg_chdir
);
3527 if (arg_start_mode
== START_PID2
) {
3528 r
= stub_pid1(arg_uuid
);
3533 if (arg_console_mode
!= CONSOLE_PIPE
) {
3534 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3535 * are configured for that. Acquire it as controlling tty. */
3536 if (ioctl(STDIN_FILENO
, TIOCSCTTY
) < 0)
3537 return log_error_errno(errno
, "Failed to acquire controlling TTY: %m");
3540 log_debug("Inner child completed, invoking payload.");
3542 /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3543 * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3544 * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3546 log_set_open_when_needed(true);
3547 log_settle_target();
3549 (void) fdset_close_others(fds
);
3551 if (arg_start_mode
== START_BOOT
) {
3555 /* Automatically search for the init system */
3557 m
= strv_length(arg_parameters
);
3558 a
= newa(char*, m
+ 2);
3559 memcpy_safe(a
+ 1, arg_parameters
, m
* sizeof(char*));
3562 FOREACH_STRING(init
,
3563 "/usr/lib/systemd/systemd",
3564 "/lib/systemd/systemd",
3566 a
[0] = (char*) init
;
3567 execve(a
[0], a
, env_use
);
3570 exec_target
= "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3571 } else if (!strv_isempty(arg_parameters
)) {
3572 const char *dollar_path
;
3574 exec_target
= arg_parameters
[0];
3576 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3578 dollar_path
= strv_env_get(env_use
, "PATH");
3580 if (setenv("PATH", dollar_path
, 1) < 0)
3581 return log_error_errno(errno
, "Failed to update $PATH: %m");
3584 execvpe(arg_parameters
[0], arg_parameters
, env_use
);
3587 /* If we cannot change the directory, we'll end up in /, that is expected. */
3588 (void) chdir(home
?: "/root");
3590 execle(DEFAULT_USER_SHELL
, "-" DEFAULT_USER_SHELL_NAME
, NULL
, env_use
);
3591 if (!streq(DEFAULT_USER_SHELL
, "/bin/bash"))
3592 execle("/bin/bash", "-bash", NULL
, env_use
);
3593 if (!streq(DEFAULT_USER_SHELL
, "/bin/sh"))
3594 execle("/bin/sh", "-sh", NULL
, env_use
);
3596 exec_target
= DEFAULT_USER_SHELL
", /bin/bash, /bin/sh";
3599 return log_error_errno(errno
, "execv(%s) failed: %m", exec_target
);
3602 static int setup_notify_child(void) {
3603 _cleanup_close_
int fd
= -EBADF
;
3604 static const union sockaddr_union sa
= {
3605 .un
.sun_family
= AF_UNIX
,
3606 .un
.sun_path
= NSPAWN_NOTIFY_SOCKET_PATH
,
3610 fd
= socket(AF_UNIX
, SOCK_DGRAM
|SOCK_CLOEXEC
|SOCK_NONBLOCK
, 0);
3612 return log_error_errno(errno
, "Failed to allocate notification socket: %m");
3614 (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH
, 0755);
3615 (void) sockaddr_un_unlink(&sa
.un
);
3617 r
= bind(fd
, &sa
.sa
, SOCKADDR_UN_LEN(sa
.un
));
3619 return log_error_errno(errno
, "bind(" NSPAWN_NOTIFY_SOCKET_PATH
") failed: %m");
3621 r
= userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH
, 0, 0);
3623 return log_error_errno(r
, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH
": %m");
3625 r
= setsockopt_int(fd
, SOL_SOCKET
, SO_PASSCRED
, true);
3627 return log_error_errno(r
, "SO_PASSCRED failed: %m");
/* Body of the outer child process; the comment at the top of the function body explains its role.
 *
 * Steps visible in this listing, in order:
 *  - load the host os-release pairs with the container_host_ prefix, request SIGKILL on parent
 *    death via PR_SET_PDEATHSIG, reset the audit loginuid and mark the whole mount tree MS_SLAVE;
 *  - if dissected_image is set, mount only the image root, then call determine_uid_shift();
 *  - with user namespacing enabled: send an fd pinning this mount namespace and arg_uid_shift over
 *    fd_outer_socket and, in USER_NAMESPACE_PICK mode, read back the shift selected by the parent;
 *  - prepare the root directory (bind mount when directory is /, make_mount_point(), MS_PRIVATE,
 *    pivot root, volatile mode, bind-user preparation) and send the bind-user UID/GID quadruplets
 *    to the parent;
 *  - try ID mapped mounts via remount_idmap(); on -EINVAL or a not-supported error fall back to
 *    recursive chown()ing, but only in USER_NAMESPACE_OWNERSHIP_AUTO mode;
 *  - mount the remaining image partitions, report the detected cgroup mode, populate the tree
 *    (base filesystem, device nodes, pts, mount tunnel, keyring, credentials, timezone,
 *    resolv.conf, machine ID, journal, /run/host/ files);
 *  - switch root with MS_SHARED propagation, create the notify socket, raw_clone() the inner child
 *    (which ends up in inner_child()) and finally send the PID of the inner child, arg_uuid and
 *    the notify fd to the parent over fd_outer_socket.
 *
 * Errors are logged and returned as negative errno-style values.
 *
 * NOTE(review): this listing does not show short lines, e.g. the error checks following most
 * calls, the remaining parameters (barrier, fds and netns_fd are used in the body) and the final
 * return statement. Confirm against the complete file before relying on details. */
3632 static int outer_child(
3634 const char *directory
,
3635 DissectedImage
*dissected_image
,
3636 int fd_outer_socket
,
3637 int fd_inner_socket
,
3641 _cleanup_(bind_user_context_freep
) BindUserContext
*bind_user_context
= NULL
;
3642 _cleanup_strv_free_
char **os_release_pairs
= NULL
;
3643 _cleanup_close_
int fd
= -EBADF
, mntns_fd
= -EBADF
;
3650 /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3651 * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3652 * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3653 * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3654 * forked off it, and it exits. */
3658 assert(fd_outer_socket
>= 0);
3659 assert(fd_inner_socket
>= 0);
3661 log_debug("Outer child is initializing.");
/* The pairs collected here are handed to inner_child() further down. */
3663 r
= load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs
);
3665 log_debug_errno(r
, "Failed to read os-release from host for container, ignoring: %m");
3667 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0)
3668 return log_error_errno(errno
, "PR_SET_PDEATHSIG failed: %m");
3670 r
= reset_audit_loginuid();
3674 /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3675 * mounts to the real root. */
3676 r
= mount_follow_verbose(LOG_ERR
, NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
);
3680 if (dissected_image
) {
3681 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3682 * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3683 * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3684 * right place right away. This makes sure ESP partitions and userns are compatible. */
3686 r
= dissected_image_mount_and_warn(
3691 /* userns_fd= */ -EBADF
,
3692 DISSECT_IMAGE_MOUNT_ROOT_ONLY
|
3693 DISSECT_IMAGE_DISCARD_ON_LOOP
|
3694 DISSECT_IMAGE_USR_NO_ROOT
|
3695 (arg_read_only
? DISSECT_IMAGE_READ_ONLY
: DISSECT_IMAGE_FSCK
|DISSECT_IMAGE_GROWFS
)|
3696 (arg_start_mode
== START_BOOT
? DISSECT_IMAGE_VALIDATE_OS
: 0));
3701 r
= determine_uid_shift(directory
);
/* User namespace handshake with the parent over fd_outer_socket: first an fd pinning this mount
 * namespace, then the UID shift. */
3705 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
3706 r
= namespace_open(0, NULL
, &mntns_fd
, NULL
, NULL
, NULL
);
3708 return log_error_errno(r
, "Failed to pin outer mount namespace: %m");
3710 l
= send_one_fd(fd_outer_socket
, mntns_fd
, 0);
3712 return log_error_errno(l
, "Failed to send outer mount namespace fd: %m");
3713 mntns_fd
= safe_close(mntns_fd
);
3715 /* Let the parent know which UID shift we read from the image */
3716 l
= send(fd_outer_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), MSG_NOSIGNAL
);
3718 return log_error_errno(errno
, "Failed to send UID shift: %m");
3719 if (l
!= sizeof(arg_uid_shift
))
3720 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3721 "Short write while sending UID shift.");
3723 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
3724 /* When we are supposed to pick the UID shift, the parent will check now whether the
3725 * UID shift we just read from the image is available. If yes, it will send the UID
3726 * shift back to us, if not it will pick a different one, and send it back to us. */
3728 l
= recv(fd_outer_socket
, &arg_uid_shift
, sizeof(arg_uid_shift
), 0);
3730 return log_error_errno(errno
, "Failed to recv UID shift: %m");
3731 if (l
!= sizeof(arg_uid_shift
))
3732 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3733 "Short read while receiving UID shift.");
3736 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
3737 "Selected user namespace base " UID_FMT
" and range " UID_FMT
".", arg_uid_shift
, arg_uid_range
);
3740 if (path_equal(directory
, "/")) {
3741 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3742 * place, so that we can make changes to its mount structure (for example, to implement
3743 * --volatile=) without this interfering with our ability to access files such as
3744 * /etc/localtime to copy into the container. Note that we use a fixed place for this
3745 * (instead of a temporary directory, since we are living in our own mount namespace here
3746 * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3747 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3749 r
= mount_nofollow_verbose(LOG_ERR
, "/", "/run/systemd/nspawn-root", NULL
, MS_BIND
|MS_REC
, NULL
);
3753 directory
= "/run/systemd/nspawn-root";
3756 /* Make sure we always have a mount that we can move to root later on. */
3757 r
= make_mount_point(directory
);
3761 /* So the whole tree is now MS_SLAVE, i.e. we'll still receive mount/umount events from the host
3762 * mount namespace. For the directory we are going to run our container let's turn this off, so that
3763 * we'll live in our own little world from now on, and propagation from the host may only happen via
3764 * the mount tunnel dir, or not at all. */
3765 r
= mount_follow_verbose(LOG_ERR
, NULL
, directory
, NULL
, MS_PRIVATE
|MS_REC
, NULL
);
3769 r
= setup_pivot_root(
3772 arg_pivot_root_old
);
3776 r
= setup_volatile_mode(
3780 arg_selinux_apifs_context
);
3784 r
= bind_user_prepare(
3789 &arg_custom_mounts
, &arg_n_custom_mounts
,
3790 &bind_user_context
);
3794 if (arg_userns_mode
!= USER_NAMESPACE_NO
&& bind_user_context
) {
3795 /* Send the user maps we determined to the parent, so that it installs it in our user
3796 * namespace UID map table */
3798 for (size_t i
= 0; i
< bind_user_context
->n_data
; i
++) {
3800 bind_user_context
->data
[i
].payload_user
->uid
,
3801 bind_user_context
->data
[i
].host_user
->uid
,
3802 (uid_t
) bind_user_context
->data
[i
].payload_group
->gid
,
3803 (uid_t
) bind_user_context
->data
[i
].host_group
->gid
,
3806 l
= send(fd_outer_socket
, map
, sizeof(map
), MSG_NOSIGNAL
);
3808 return log_error_errno(errno
, "Failed to send user UID map: %m");
3809 if (l
!= sizeof(map
))
3810 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3811 "Short write while sending user UID map.");
3818 arg_n_custom_mounts
,
3821 arg_selinux_apifs_context
,
3826 if (arg_userns_mode
!= USER_NAMESPACE_NO
&&
3827 IN_SET(arg_userns_ownership
, USER_NAMESPACE_OWNERSHIP_MAP
, USER_NAMESPACE_OWNERSHIP_AUTO
) &&
3828 arg_uid_shift
!= 0) {
3829 _cleanup_free_
char *usr_subtree
= NULL
;
3833 dirs
[i
++] = (char*) directory
;
3835 if (dissected_image
&& dissected_image
->partitions
[PARTITION_USR
].found
) {
3836 usr_subtree
= path_join(directory
, "/usr");
3840 dirs
[i
++] = usr_subtree
;
3845 r
= remount_idmap(dirs
, arg_uid_shift
, arg_uid_range
, UID_INVALID
, REMOUNT_IDMAPPING_HOST_ROOT
);
3846 if (r
== -EINVAL
|| ERRNO_IS_NEG_NOT_SUPPORTED(r
)) {
3847 /* This might fail because the kernel or file system doesn't support idmapping. We
3848 * can't really distinguish this nicely, nor do we have any guarantees about the
3849 * error codes we see, could be EOPNOTSUPP or EINVAL. */
3850 if (arg_userns_ownership
!= USER_NAMESPACE_OWNERSHIP_AUTO
)
3851 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
3852 "ID mapped mounts are apparently not available, sorry.");
3854 log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3855 arg_userns_ownership
= USER_NAMESPACE_OWNERSHIP_CHOWN
;
3857 return log_error_errno(r
, "Failed to set up ID mapped mounts: %m");
3859 log_debug("ID mapped mounts available, making use of them.");
3864 if (dissected_image
) {
3865 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3866 r
= dissected_image_mount(
3871 /* userns_fd= */ -EBADF
,
3872 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY
|
3873 DISSECT_IMAGE_DISCARD_ON_LOOP
|
3874 DISSECT_IMAGE_USR_NO_ROOT
|
3875 (arg_read_only
? DISSECT_IMAGE_READ_ONLY
: DISSECT_IMAGE_FSCK
|DISSECT_IMAGE_GROWFS
)|
3876 (idmap
? DISSECT_IMAGE_MOUNT_IDMAPPED
: 0));
3878 return log_error_errno(r
, "File system check for image failed: %m");
3880 return log_error_errno(r
, "Failed to mount image file system: %m");
3883 if (arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
3884 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3886 r
= detect_unified_cgroup_hierarchy_from_image(directory
);
3890 l
= send(fd_outer_socket
, &arg_unified_cgroup_hierarchy
, sizeof(arg_unified_cgroup_hierarchy
), MSG_NOSIGNAL
);
3892 return log_error_errno(errno
, "Failed to send cgroup mode: %m");
3893 if (l
!= sizeof(arg_unified_cgroup_hierarchy
))
3894 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
3895 "Short write while sending cgroup mode.");
3898 r
= recursive_chown(directory
, arg_uid_shift
, arg_uid_range
);
3902 r
= base_filesystem_create(directory
, arg_uid_shift
, (gid_t
) arg_uid_shift
);
3906 if (arg_read_only
&& arg_volatile_mode
== VOLATILE_NO
&&
3907 !has_custom_root_mount(arg_custom_mounts
, arg_n_custom_mounts
)) {
3908 r
= bind_remount_recursive(directory
, MS_RDONLY
, MS_RDONLY
, NULL
);
3910 return log_error_errno(r
, "Failed to make tree read-only: %m");
3913 r
= mount_all(directory
,
3916 arg_selinux_apifs_context
);
3920 r
= copy_devnodes(directory
);
3924 r
= make_extra_nodes(directory
);
3928 (void) dev_setup(directory
, arg_uid_shift
, arg_uid_shift
);
3930 p
= prefix_roota(directory
, "/run/host");
3931 (void) make_inaccessible_nodes(p
, arg_uid_shift
, arg_uid_shift
);
3933 r
= setup_pts(directory
);
3937 r
= mount_tunnel_dig(directory
);
3941 r
= setup_keyring();
3945 r
= setup_credentials(directory
);
3949 r
= bind_user_setup(bind_user_context
, directory
);
3956 arg_n_custom_mounts
,
3959 arg_selinux_apifs_context
,
3960 MOUNT_NON_ROOT_ONLY
);
3964 r
= setup_timezone(directory
);
3968 r
= setup_resolv_conf(directory
);
3972 r
= setup_machine_id(directory
);
3976 r
= setup_journal(directory
);
3980 /* The same stuff as the $container env var, but nicely readable for the entire payload */
3981 p
= prefix_roota(directory
, "/run/host/container-manager");
3982 (void) write_string_file(p
, arg_container_service_name
, WRITE_STRING_FILE_CREATE
);
3984 /* The same stuff as the $container_uuid env var */
3985 p
= prefix_roota(directory
, "/run/host/container-uuid");
3986 (void) write_string_filef(p
, WRITE_STRING_FILE_CREATE
, SD_ID128_UUID_FORMAT_STR
, SD_ID128_FORMAT_VAL(arg_uuid
));
3988 if (!arg_use_cgns
) {
3991 arg_unified_cgroup_hierarchy
,
3992 arg_userns_mode
!= USER_NAMESPACE_NO
,
3995 arg_selinux_apifs_context
,
4001 /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
4002 * mounts available in systemd services inside the container that create a new mount namespace. See
4003 * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
4004 * will inherit the shared propagation mode.
4006 * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
4007 * directory mount to root later on.
4008 * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
4010 r
= mount_switch_root(directory
, MS_SHARED
);
4012 return log_error_errno(r
, "Failed to move root directory: %m");
4014 /* We finished setting up the rootfs which is a shared mount. The mount tunnel needs to be a
4015 * dependent mount otherwise we can't MS_MOVE mounts that were propagated from the host into
4017 r
= mount_tunnel_open();
4021 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
4022 /* In order to mount procfs and sysfs in an unprivileged container the kernel
4023 * requires that a fully visible instance is already present in the target mount
4024 * namespace. Mount one here so the inner child can mount its own instances. Later
4025 * we umount the temporary instances created here before we actually exec the
4026 * payload. Since the rootfs is shared the umount will propagate into the container.
4027 * Note, the inner child wouldn't be able to unmount the instances on its own since
4028 * it doesn't own the originating mount namespace. IOW, the outer child needs to do
4030 r
= pin_fully_visible_fs();
4035 fd
= setup_notify_child();
4039 pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
|
4040 arg_clone_ns_flags
|
4041 (arg_userns_mode
!= USER_NAMESPACE_NO
? CLONE_NEWUSER
: 0));
4043 return log_error_errno(errno
, "Failed to fork inner child: %m");
4045 fd_outer_socket
= safe_close(fd_outer_socket
);
4047 /* The inner child has all namespaces that are requested, so that we all are owned by the
4048 * user if user namespaces are turned on. */
4050 if (arg_network_namespace_path
) {
4051 r
= namespace_enter(-1, -1, netns_fd
, -1, -1);
4053 return log_error_errno(r
, "Failed to join network namespace: %m");
4056 r
= inner_child(barrier
, fd_inner_socket
, fds
, os_release_pairs
);
4058 _exit(EXIT_FAILURE
);
4060 _exit(EXIT_SUCCESS
);
/* Report to the parent over fd_outer_socket: the PID returned by raw_clone(), then arg_uuid, then
 * the notify socket fd. */
4063 l
= send(fd_outer_socket
, &pid
, sizeof(pid
), MSG_NOSIGNAL
);
4065 return log_error_errno(errno
, "Failed to send PID: %m");
4066 if (l
!= sizeof(pid
))
4067 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
4068 "Short write while sending PID.");
4070 l
= send(fd_outer_socket
, &arg_uuid
, sizeof(arg_uuid
), MSG_NOSIGNAL
);
4072 return log_error_errno(errno
, "Failed to send machine ID: %m");
4073 if (l
!= sizeof(arg_uuid
))
4074 return log_error_errno(SYNTHETIC_ERRNO(EIO
),
4075 "Short write while sending machine ID.");
4077 l
= send_one_fd(fd_outer_socket
, fd
, 0);
4079 return log_error_errno(l
, "Failed to send notify fd: %m");
4081 fd_outer_socket
= safe_close(fd_outer_socket
);
4082 fd_inner_socket
= safe_close(fd_inner_socket
);
4083 netns_fd
= safe_close(netns_fd
);
4088 static int uid_shift_pick(uid_t
*shift
, LockFile
*ret_lock_file
) {
4089 bool tried_hashed
= false;
4090 unsigned n_tries
= 100;
4095 assert(ret_lock_file
);
4096 assert(arg_userns_mode
== USER_NAMESPACE_PICK
);
4097 assert(arg_uid_range
== 0x10000U
);
4101 (void) mkdir("/run/systemd/nspawn-uid", 0755);
4104 char lock_path
[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t
) + 1];
4105 _cleanup_(release_lock_file
) LockFile lf
= LOCK_FILE_INIT
;
4110 if (candidate
< CONTAINER_UID_BASE_MIN
|| candidate
> CONTAINER_UID_BASE_MAX
)
4112 if ((candidate
& UINT32_C(0xFFFF)) != 0)
4115 xsprintf(lock_path
, "/run/systemd/nspawn-uid/" UID_FMT
, candidate
);
4116 r
= make_lock_file(lock_path
, LOCK_EX
|LOCK_NB
, &lf
);
4117 if (r
== -EBUSY
) /* Range already taken by another nspawn instance */
4122 /* Make some superficial checks whether the range is currently known in the user database */
4123 if (getpwuid(candidate
))
4125 if (getpwuid(candidate
+ UINT32_C(0xFFFE)))
4127 if (getgrgid(candidate
))
4129 if (getgrgid(candidate
+ UINT32_C(0xFFFE)))
4132 *ret_lock_file
= lf
;
4133 lf
= (struct LockFile
) LOCK_FILE_INIT
;
4138 if (arg_machine
&& !tried_hashed
) {
4139 /* Try to hash the base from the container name */
4141 static const uint8_t hash_key
[] = {
4142 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
4143 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
4146 candidate
= (uid_t
) siphash24(arg_machine
, strlen(arg_machine
), hash_key
);
4148 tried_hashed
= true;
4150 random_bytes(&candidate
, sizeof(candidate
));
4152 candidate
= (candidate
% (CONTAINER_UID_BASE_MAX
- CONTAINER_UID_BASE_MIN
)) + CONTAINER_UID_BASE_MIN
;
4153 candidate
&= (uid_t
) UINT32_C(0xFFFF0000);
4157 static int add_one_uid_map(
4159 uid_t container_uid
,
4163 return strextendf(p
,
4164 UID_FMT
" " UID_FMT
" " UID_FMT
"\n",
4165 container_uid
, host_uid
, range
);
4168 static int make_uid_map_string(
4169 const uid_t bind_user_uid
[],
4170 size_t n_bind_user_uid
,
4174 _cleanup_free_
char *s
= NULL
;
4175 uid_t previous_uid
= 0;
4178 assert(n_bind_user_uid
== 0 || bind_user_uid
);
4179 assert(IN_SET(offset
, 0, 2)); /* used to switch between UID and GID map */
4182 /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4183 * quadruplet, consisting of host and container UID + GID. */
4185 for (size_t i
= 0; i
< n_bind_user_uid
; i
++) {
4186 uid_t payload_uid
= bind_user_uid
[i
*4+offset
],
4187 host_uid
= bind_user_uid
[i
*4+offset
+1];
4189 assert(previous_uid
<= payload_uid
);
4190 assert(payload_uid
< arg_uid_range
);
4192 /* Add a range to close the gap to previous entry */
4193 if (payload_uid
> previous_uid
) {
4194 r
= add_one_uid_map(&s
, previous_uid
, arg_uid_shift
+ previous_uid
, payload_uid
- previous_uid
);
4199 /* Map this specific user */
4200 r
= add_one_uid_map(&s
, payload_uid
, host_uid
, 1);
4204 previous_uid
= payload_uid
+ 1;
4207 /* And add a range to close the gap to finish the range */
4208 if (arg_uid_range
> previous_uid
) {
4209 r
= add_one_uid_map(&s
, previous_uid
, arg_uid_shift
+ previous_uid
, arg_uid_range
- previous_uid
);
4220 static int setup_uid_map(
4222 const uid_t bind_user_uid
[],
4223 size_t n_bind_user_uid
) {
4225 char uid_map
[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t
) + 1];
4226 _cleanup_free_
char *s
= NULL
;
4231 /* Build the UID map string */
4232 if (make_uid_map_string(bind_user_uid
, n_bind_user_uid
, 0, &s
) < 0) /* offset=0 contains the UID pair */
4235 xsprintf(uid_map
, "/proc/" PID_FMT
"/uid_map", pid
);
4236 r
= write_string_file(uid_map
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
4238 return log_error_errno(r
, "Failed to write UID map: %m");
4240 /* And now build the GID map string */
4242 if (make_uid_map_string(bind_user_uid
, n_bind_user_uid
, 2, &s
) < 0) /* offset=2 contains the GID pair */
4245 xsprintf(uid_map
, "/proc/" PID_FMT
"/gid_map", pid
);
4246 r
= write_string_file(uid_map
, s
, WRITE_STRING_FILE_DISABLE_BUFFER
);
4248 return log_error_errno(r
, "Failed to write GID map: %m");
/* Event-loop I/O callback for the notification fd. It receives one message with recvmsg_safe(), accepts it
 * only if the attached SCM_CREDENTIALS carry the PID of the inner child (passed in via `userdata`), splits
 * the payload into lines and forwards "READY=1" and the "STATUS=" text with sd_notify()/sd_notifyf().
 *
 * NOTE(review): this listing omits some source lines, so the declarations of `n`, `r` and `p`, the
 * remaining initializers of `iovec`/`msghdr`, and the return statements of most branches are not visible
 * here — confirm against the complete file. */
4253 static int nspawn_dispatch_notify_fd(sd_event_source
*source
, int fd
, uint32_t revents
, void *userdata
) {
/* The buffer is one byte larger than NOTIFY_BUFFER_MAX while iov_len below is sizeof(buf)-1, so the
 * receive never fills the last byte — presumably room for a terminating NUL (that write is not visible
 * in this listing). */
4254 char buf
[NOTIFY_BUFFER_MAX
+1];
4256 struct iovec iovec
= {
4258 .iov_len
= sizeof(buf
)-1,
/* Control buffer with space for one struct ucred plus NOTIFY_FD_MAX file descriptors. */
4260 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred
)) +
4261 CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX
)) control
;
4262 struct msghdr msghdr
= {
4265 .msg_control
= &control
,
4266 .msg_controllen
= sizeof(control
),
4268 struct ucred
*ucred
;
4270 pid_t inner_child_pid
;
4271 _cleanup_strv_free_
char **tags
= NULL
;
/* The PID is carried encoded in the userdata pointer itself, it is not a pointer to a pid_t. */
4276 inner_child_pid
= PTR_TO_PID(userdata
);
4278 if (revents
!= EPOLLIN
) {
4279 log_warning("Got unexpected poll event for notify fd.");
4283 n
= recvmsg_safe(fd
, &msghdr
, MSG_DONTWAIT
|MSG_CMSG_CLOEXEC
);
4284 if (ERRNO_IS_NEG_TRANSIENT(n
))
4286 else if (n
== -EXFULL
) {
4287 log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
4290 return log_warning_errno(n
, "Couldn't read notification socket: %m");
/* File descriptors that came along with the message are not used by the visible code below; only the
 * credentials and the text payload are. */
4292 cmsg_close_all(&msghdr
);
/* Only messages whose credentials name the inner child's PID are processed. */
4294 ucred
= CMSG_FIND_DATA(&msghdr
, SOL_SOCKET
, SCM_CREDENTIALS
, struct ucred
);
4295 if (!ucred
|| ucred
->pid
!= inner_child_pid
) {
4296 log_debug("Received notify message without valid credentials. Ignoring.");
/* NOTE(review): with iov_len == sizeof(buf)-1 a single receive presumably cannot return sizeof(buf)
 * bytes, so this looks like a purely defensive check — confirm against recvmsg_safe(). */
4300 if ((size_t) n
>= sizeof(buf
)) {
4301 log_warning("Received notify message exceeded maximum size. Ignoring.");
/* The payload is split on newline and carriage return into individual tags. */
4306 tags
= strv_split(buf
, "\n\r");
4310 if (strv_contains(tags
, "READY=1")) {
4311 r
= sd_notify(false, "READY=1\n");
4313 log_warning_errno(r
, "Failed to send readiness notification, ignoring: %m");
/* The text after "STATUS=" is forwarded with a "Container running: " prefix. NOTE(review): the guard
 * for the case that no such tag exists is not visible in this listing. */
4316 p
= strv_find_startswith(tags
, "STATUS=");
4318 (void) sd_notifyf(false, "STATUS=Container running: %s", p
);
/* Registers `fd` with `event` as an EPOLLIN I/O source dispatched to nspawn_dispatch_notify_fd(), stores
 * the new source in *notify_event_source and gives it the description "nspawn-notify".
 *
 * `inner_child_pid` is handed to the callback unchanged as its userdata. The callback decodes it with
 * PTR_TO_PID(), and the caller visible in this file passes PID_TO_PTR(*pid), so the argument carries the
 * PID encoded in the pointer value rather than pointing to a pid_t.
 *
 * NOTE(review): this listing omits some source lines, so the declaration of `r`, the condition guarding
 * the error return and the final return value are not visible here — confirm against the complete file. */
4323 static int setup_notify_parent(sd_event
*event
, int fd
, pid_t
*inner_child_pid
, sd_event_source
**notify_event_source
) {
4326 r
= sd_event_add_io(event
, notify_event_source
, fd
, EPOLLIN
, nspawn_dispatch_notify_fd
, inner_child_pid
);
4328 return log_error_errno(r
, "Failed to allocate notify event source: %m");
/* Failure to set the description is deliberately ignored. */
4330 (void) sd_event_source_set_description(*notify_event_source
, "nspawn-notify");
/* Applies the values of a loaded Settings object to the global arg_* variables.
 *
 * With the exception of the OCI-only fields at the very end, a value is taken over only if the matching
 * SETTING_* bit is not set in arg_settings_mask and the Settings object actually carries a value for it.
 * Pointer-typed fields are moved rather than copied (free_and_replace(), strv_free_and_replace(),
 * TAKE_PTR(), TAKE_STRUCT()), so `settings` is modified by this call. For a number of settings a warning
 * naming `path` is logged when arg_settings_trusted is false.
 *
 * NOTE(review): this listing omits some source lines. In particular the `else` keywords that presumably
 * separate each "is not trusted" warning from the assignment that follows it, some conditions (e.g. after
 * SETTING_DIRECTORY, SETTING_HOSTNAME and SETTING_SLICE), the declarations of `ambient` and `rl`, and the
 * final return value are not visible here — confirm against the complete file. */
4335 static int merge_settings(Settings
*settings
, const char *path
) {
4341 /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
4342 * that this steals the fields of the Settings* structure, and hence modifies it. */
/* The start mode and the parameter list are taken over together. */
4344 if ((arg_settings_mask
& SETTING_START_MODE
) == 0 &&
4345 settings
->start_mode
>= 0) {
4346 arg_start_mode
= settings
->start_mode
;
4347 strv_free_and_replace(arg_parameters
, settings
->parameters
);
4350 if ((arg_settings_mask
& SETTING_EPHEMERAL
) == 0 &&
4351 settings
->ephemeral
>= 0)
4352 arg_ephemeral
= settings
->ephemeral
;
4354 if ((arg_settings_mask
& SETTING_DIRECTORY
) == 0 &&
4357 if (!arg_settings_trusted
)
4358 log_warning("Ignoring root directory setting, file %s is not trusted.", path
);
4360 free_and_replace(arg_directory
, settings
->root
);
/* Both pivot-root paths are replaced as a pair, keyed on pivot_root_new being set. */
4363 if ((arg_settings_mask
& SETTING_PIVOT_ROOT
) == 0 &&
4364 settings
->pivot_root_new
) {
4365 free_and_replace(arg_pivot_root_new
, settings
->pivot_root_new
);
4366 free_and_replace(arg_pivot_root_old
, settings
->pivot_root_old
);
4369 if ((arg_settings_mask
& SETTING_WORKING_DIRECTORY
) == 0 &&
4370 settings
->working_directory
)
4371 free_and_replace(arg_chdir
, settings
->working_directory
);
4373 if ((arg_settings_mask
& SETTING_ENVIRONMENT
) == 0 &&
4374 settings
->environment
)
4375 strv_free_and_replace(arg_setenv
, settings
->environment
);
/* User name, UID, GID and supplementary GIDs are all covered by the single SETTING_USER bit. */
4377 if ((arg_settings_mask
& SETTING_USER
) == 0) {
4380 free_and_replace(arg_user
, settings
->user
);
4382 if (uid_is_valid(settings
->uid
))
4383 arg_uid
= settings
->uid
;
4384 if (gid_is_valid(settings
->gid
))
4385 arg_gid
= settings
->gid
;
4386 if (settings
->n_supplementary_gids
> 0) {
4387 free_and_replace(arg_supplementary_gids
, settings
->supplementary_gids
);
4388 arg_n_supplementary_gids
= settings
->n_supplementary_gids
;
4392 if ((arg_settings_mask
& SETTING_CAPABILITY
) == 0) {
4393 uint64_t plus
, minus
;
4394 uint64_t network_minus
= 0;
4397 /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
4398 * Settings structure */
4400 plus
= settings
->capability
;
4401 minus
= settings
->drop_capability
;
/* Network settings from the file influence CAP_NET_ADMIN: the bit is added to `plus` when the
 * settings ask for a private network, and it is also recorded in `network_minus` (NOTE(review):
 * presumably in an `else` branch that is not visible in this listing). */
4403 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
4404 settings_network_configured(settings
)) {
4405 if (settings_private_network(settings
))
4406 plus
|= UINT64_C(1) << CAP_NET_ADMIN
;
4408 network_minus
|= UINT64_C(1) << CAP_NET_ADMIN
;
4411 if (!arg_settings_trusted
&& plus
!= 0) {
4412 if (settings
->capability
!= 0)
4413 log_warning("Ignoring Capability= setting, file %s is not trusted.", path
);
4415 arg_caps_retain
&= ~network_minus
;
4416 arg_caps_retain
|= plus
;
/* Dropped capabilities are removed from the retained set after the additions above. */
4419 arg_caps_retain
&= ~minus
;
4421 /* Copy the full capabilities over too */
4422 if (capability_quintet_is_set(&settings
->full_capabilities
)) {
4423 if (!arg_settings_trusted
)
4424 log_warning("Ignoring capability settings, file %s is not trusted.", path
);
4426 arg_full_capabilities
= settings
->full_capabilities
;
4429 ambient
= settings
->ambient_capability
;
4430 if (!arg_settings_trusted
&& ambient
!= 0)
4431 log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path
);
4433 arg_caps_ambient
|= ambient
;
4436 if ((arg_settings_mask
& SETTING_KILL_SIGNAL
) == 0 &&
4437 settings
->kill_signal
> 0)
4438 arg_kill_signal
= settings
->kill_signal
;
4440 if ((arg_settings_mask
& SETTING_PERSONALITY
) == 0 &&
4441 settings
->personality
!= PERSONALITY_INVALID
)
4442 arg_personality
= settings
->personality
;
4444 if ((arg_settings_mask
& SETTING_MACHINE_ID
) == 0 &&
4445 !sd_id128_is_null(settings
->machine_id
)) {
4447 if (!arg_settings_trusted
)
4448 log_warning("Ignoring MachineID= setting, file %s is not trusted.", path
);
4450 arg_uuid
= settings
->machine_id
;
4453 if ((arg_settings_mask
& SETTING_READ_ONLY
) == 0 &&
4454 settings
->read_only
>= 0)
4455 arg_read_only
= settings
->read_only
;
4457 if ((arg_settings_mask
& SETTING_VOLATILE_MODE
) == 0 &&
4458 settings
->volatile_mode
!= _VOLATILE_MODE_INVALID
)
4459 arg_volatile_mode
= settings
->volatile_mode
;
4461 if ((arg_settings_mask
& SETTING_CUSTOM_MOUNTS
) == 0 &&
4462 settings
->n_custom_mounts
> 0) {
4464 if (!arg_settings_trusted
)
4465 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path
);
/* The previous mount list is freed and the array from `settings` is taken over; the count in
 * `settings` is reset to 0 to match the array that was moved out. */
4467 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
4468 arg_custom_mounts
= TAKE_PTR(settings
->custom_mounts
);
4469 arg_n_custom_mounts
= settings
->n_custom_mounts
;
4470 settings
->n_custom_mounts
= 0;
/* All network-related fields are taken over as one group under the SETTING_NETWORK bit. */
4474 if ((arg_settings_mask
& SETTING_NETWORK
) == 0 &&
4475 settings_network_configured(settings
)) {
4477 if (!arg_settings_trusted
)
4478 log_warning("Ignoring network settings, file %s is not trusted.", path
);
4480 arg_network_veth
= settings_network_veth(settings
);
4481 arg_private_network
= settings_private_network(settings
);
4483 strv_free_and_replace(arg_network_interfaces
, settings
->network_interfaces
);
4484 strv_free_and_replace(arg_network_macvlan
, settings
->network_macvlan
);
4485 strv_free_and_replace(arg_network_ipvlan
, settings
->network_ipvlan
);
4486 strv_free_and_replace(arg_network_veth_extra
, settings
->network_veth_extra
);
4488 free_and_replace(arg_network_bridge
, settings
->network_bridge
);
4489 free_and_replace(arg_network_zone
, settings
->network_zone
);
4491 free_and_replace(arg_network_namespace_path
, settings
->network_namespace_path
);
4495 if ((arg_settings_mask
& SETTING_EXPOSE_PORTS
) == 0 &&
4496 settings
->expose_ports
) {
4498 if (!arg_settings_trusted
)
4499 log_warning("Ignoring Port= setting, file %s is not trusted.", path
);
4501 expose_port_free_all(arg_expose_ports
);
4502 arg_expose_ports
= TAKE_PTR(settings
->expose_ports
);
/* User namespace mode, UID shift, UID range and ownership mode are taken over together. */
4506 if ((arg_settings_mask
& SETTING_USERNS
) == 0 &&
4507 settings
->userns_mode
!= _USER_NAMESPACE_MODE_INVALID
) {
4509 if (!arg_settings_trusted
)
4510 log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path
);
4512 arg_userns_mode
= settings
->userns_mode
;
4513 arg_uid_shift
= settings
->uid_shift
;
4514 arg_uid_range
= settings
->uid_range
;
4515 arg_userns_ownership
= settings
->userns_ownership
;
4519 if ((arg_settings_mask
& SETTING_BIND_USER
) == 0 &&
4520 !strv_isempty(settings
->bind_user
))
4521 strv_free_and_replace(arg_bind_user
, settings
->bind_user
);
4523 if ((arg_settings_mask
& SETTING_NOTIFY_READY
) == 0 &&
4524 settings
->notify_ready
>= 0)
4525 arg_notify_ready
= settings
->notify_ready
;
/* The SETTING_SYSCALL_FILTER bit covers both the allow/deny lists and the seccomp object. */
4527 if ((arg_settings_mask
& SETTING_SYSCALL_FILTER
) == 0) {
4529 if (!strv_isempty(settings
->syscall_allow_list
) || !strv_isempty(settings
->syscall_deny_list
)) {
/* The trust warning is only raised when the allow list is non-empty. */
4530 if (!arg_settings_trusted
&& !strv_isempty(settings
->syscall_allow_list
))
4531 log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path
);
4533 strv_free_and_replace(arg_syscall_allow_list
, settings
->syscall_allow_list
);
4534 strv_free_and_replace(arg_syscall_deny_list
, settings
->syscall_deny_list
);
4539 if (settings
->seccomp
) {
4540 if (!arg_settings_trusted
)
4541 log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path
);
4543 seccomp_release(arg_seccomp
);
4544 arg_seccomp
= TAKE_PTR(settings
->seccomp
);
/* Resource limits have one mask bit each, counted up from SETTING_RLIMIT_FIRST. */
4550 for (rl
= 0; rl
< _RLIMIT_MAX
; rl
++) {
4551 if ((arg_settings_mask
& (SETTING_RLIMIT_FIRST
<< rl
)))
4554 if (!settings
->rlimit
[rl
])
4557 if (!arg_settings_trusted
) {
4558 log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl
), path
);
4562 free_and_replace(arg_rlimit
[rl
], settings
->rlimit
[rl
]);
4565 if ((arg_settings_mask
& SETTING_HOSTNAME
) == 0 &&
4567 free_and_replace(arg_hostname
, settings
->hostname
);
4569 if ((arg_settings_mask
& SETTING_NO_NEW_PRIVILEGES
) == 0 &&
4570 settings
->no_new_privileges
>= 0)
4571 arg_no_new_privileges
= settings
->no_new_privileges
;
4573 if ((arg_settings_mask
& SETTING_OOM_SCORE_ADJUST
) == 0 &&
4574 settings
->oom_score_adjust_set
) {
4576 if (!arg_settings_trusted
)
4577 log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path
);
4579 arg_oom_score_adjust
= settings
->oom_score_adjust
;
4580 arg_oom_score_adjust_set
= true;
4584 if ((arg_settings_mask
& SETTING_CPU_AFFINITY
) == 0 &&
4585 settings
->cpu_set
.set
) {
4587 if (!arg_settings_trusted
)
4588 log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path
);
/* The current CPU set is reset before the one from `settings` is moved in. */
4590 cpu_set_reset(&arg_cpu_set
);
4591 arg_cpu_set
= TAKE_STRUCT(settings
->cpu_set
);
4595 if ((arg_settings_mask
& SETTING_RESOLV_CONF
) == 0 &&
4596 settings
->resolv_conf
!= _RESOLV_CONF_MODE_INVALID
)
4597 arg_resolv_conf
= settings
->resolv_conf
;
4599 if ((arg_settings_mask
& SETTING_LINK_JOURNAL
) == 0 &&
4600 settings
->link_journal
!= _LINK_JOURNAL_INVALID
) {
4602 if (!arg_settings_trusted
)
4603 log_warning("Ignoring journal link setting, file '%s' is not trusted.", path
);
4605 arg_link_journal
= settings
->link_journal
;
4606 arg_link_journal_try
= settings
->link_journal_try
;
4610 if ((arg_settings_mask
& SETTING_TIMEZONE
) == 0 &&
4611 settings
->timezone
!= _TIMEZONE_MODE_INVALID
)
4612 arg_timezone
= settings
->timezone
;
4614 if ((arg_settings_mask
& SETTING_SLICE
) == 0 &&
4617 if (!arg_settings_trusted
)
4618 log_warning("Ignoring slice setting, file '%s' is not trusted.", path
);
4620 free_and_replace(arg_slice
, settings
->slice
);
4623 if ((arg_settings_mask
& SETTING_USE_CGNS
) == 0 &&
4624 settings
->use_cgns
>= 0) {
4626 if (!arg_settings_trusted
)
4627 log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path
);
4629 arg_use_cgns
= settings
->use_cgns
;
/* ULONG_MAX is the "not configured" value for clone_ns_flags. */
4632 if ((arg_settings_mask
& SETTING_CLONE_NS_FLAGS
) == 0 &&
4633 settings
->clone_ns_flags
!= ULONG_MAX
) {
4635 if (!arg_settings_trusted
)
4636 log_warning("Ignoring namespace setting, file '%s' is not trusted.", path
);
4638 arg_clone_ns_flags
= settings
->clone_ns_flags
;
4641 if ((arg_settings_mask
& SETTING_CONSOLE_MODE
) == 0 &&
4642 settings
->console_mode
>= 0) {
4644 if (!arg_settings_trusted
)
4645 log_warning("Ignoring console mode setting, file '%s' is not trusted.", path
);
4647 arg_console_mode
= settings
->console_mode
;
4650 if ((arg_settings_mask
& SETTING_SUPPRESS_SYNC
) == 0 &&
4651 settings
->suppress_sync
>= 0)
4652 arg_suppress_sync
= settings
->suppress_sync
;
4654 /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
4655 * don't consult arg_settings_mask for them. */
4657 sd_bus_message_unref(arg_property_message
);
4658 arg_property_message
= TAKE_PTR(settings
->properties
);
4660 arg_console_width
= settings
->console_width
;
4661 arg_console_height
= settings
->console_height
;
/* Same hand-over pattern as for the custom mounts: free the old array, take the new one, and reset
 * the count left behind in `settings`. */
4663 device_node_array_free(arg_extra_nodes
, arg_n_extra_nodes
);
4664 arg_extra_nodes
= TAKE_PTR(settings
->extra_nodes
);
4665 arg_n_extra_nodes
= settings
->n_extra_nodes
;
4666 settings
->n_extra_nodes
= 0;
/* Locates the settings file named arg_settings_filename, loads it with settings_load() and applies it via
 * merge_settings().
 *
 * Search order as far as visible here: first "/etc/systemd/nspawn" and "/run/systemd/nspawn", then a file
 * in the same directory as arg_image or arg_directory. If arg_settings_trusted is still undecided (< 0),
 * it becomes true for a file from the first two locations and false for a file found next to the image or
 * directory.
 *
 * NOTE(review): this listing omits some source lines, so the declaration of `r`, the calls that open the
 * files, several conditions and the early-return values are not visible here — confirm against the
 * complete file. */
4671 static int load_settings(void) {
4672 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
4673 _cleanup_fclose_
FILE *f
= NULL
;
4674 _cleanup_free_
char *p
= NULL
;
4680 /* If all settings are masked, there's no point in looking for
4681 * the settings file */
4682 if (FLAGS_SET(arg_settings_mask
, _SETTINGS_MASK_ALL
))
4685 /* We first look in the admin's directories in /etc and /run */
4686 FOREACH_STRING(i
, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4687 _cleanup_free_
char *j
= NULL
;
4689 j
= path_join(i
, arg_settings_filename
);
4697 /* By default, we trust configuration from /etc and /run */
4698 if (arg_settings_trusted
< 0)
4699 arg_settings_trusted
= true;
/* A missing file (ENOENT) is not an error here; any other errno is reported. */
4704 if (errno
!= ENOENT
)
4705 return log_error_errno(errno
, "Failed to open %s: %m", j
);
4709 /* After that, let's look for a file next to the
4710 * actual image we shall boot. */
4713 r
= file_in_same_dir(arg_image
, arg_settings_filename
, &p
);
4715 return log_error_errno(r
, "Failed to generate settings path from image path: %m");
4716 } else if (arg_directory
) {
4717 r
= file_in_same_dir(arg_directory
, arg_settings_filename
, &p
);
4718 if (r
< 0 && r
!= -EADDRNOTAVAIL
) /* if directory is root fs, don't complain */
4719 return log_error_errno(r
, "Failed to generate settings path from directory path: %m");
4724 if (!f
&& errno
!= ENOENT
)
4725 return log_error_errno(errno
, "Failed to open %s: %m", p
);
4727 /* By default, we do not trust configuration from /var/lib/machines */
4728 if (arg_settings_trusted
< 0)
4729 arg_settings_trusted
= false;
4736 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted
));
/* Parse the opened file and hand the result to merge_settings(), passing the path `p` along. */
4738 r
= settings_load(f
, p
, &settings
);
4742 return merge_settings(settings
, p
);
/* Loads the OCI bundle named by arg_oci_bundle with oci_load() and applies the resulting Settings object
 * via merge_settings(), passing arg_oci_bundle as the path used in its messages. If arg_settings_trusted
 * is still undecided (< 0), it is set to true first.
 *
 * NOTE(review): this listing omits some source lines, so the declaration of `r`, what happens when
 * arg_oci_bundle is unset, and the handling of an oci_load() failure are not visible here — confirm
 * against the complete file. */
4745 static int load_oci_bundle(void) {
4746 _cleanup_(settings_freep
) Settings
*settings
= NULL
;
4749 if (!arg_oci_bundle
)
4752 /* By default let's trust OCI bundles */
4753 if (arg_settings_trusted
< 0)
4754 arg_settings_trusted
= true;
4756 r
= oci_load(NULL
, arg_oci_bundle
, &settings
);
4760 return merge_settings(settings
, arg_oci_bundle
);
4763 static int run_container(
4764 DissectedImage
*dissected_image
,
4766 char veth_name
[IFNAMSIZ
], bool *veth_created
,
4767 struct ExposeArgs
*expose_args
,
4768 int *master
, pid_t
*pid
, int *ret
) {
4770 static const struct sigaction sa
= {
4771 .sa_handler
= nop_signal_handler
,
4772 .sa_flags
= SA_NOCLDSTOP
|SA_RESTART
,
4775 _cleanup_(release_lock_file
) LockFile uid_shift_lock
= LOCK_FILE_INIT
;
4776 _cleanup_close_
int etc_passwd_lock
= -EBADF
;
4777 _cleanup_close_pair_
int
4778 fd_inner_socket_pair
[2] = EBADF_PAIR
,
4779 fd_outer_socket_pair
[2] = EBADF_PAIR
;
4781 _cleanup_close_
int notify_socket
= -EBADF
, mntns_fd
= -EBADF
, fd_kmsg_fifo
= -EBADF
;
4782 _cleanup_(barrier_destroy
) Barrier barrier
= BARRIER_NULL
;
4783 _cleanup_(sd_event_source_unrefp
) sd_event_source
*notify_event_source
= NULL
;
4784 _cleanup_(sd_event_unrefp
) sd_event
*event
= NULL
;
4785 _cleanup_(pty_forward_freep
) PTYForward
*forward
= NULL
;
4786 _cleanup_(sd_netlink_unrefp
) sd_netlink
*rtnl
= NULL
;
4787 _cleanup_(sd_bus_flush_close_unrefp
) sd_bus
*bus
= NULL
;
4788 _cleanup_free_ uid_t
*bind_user_uid
= NULL
;
4789 size_t n_bind_user_uid
= 0;
4790 ContainerStatus container_status
= 0;
4794 _cleanup_close_
int child_netns_fd
= -EBADF
;
4796 assert_se(sigemptyset(&mask_chld
) == 0);
4797 assert_se(sigaddset(&mask_chld
, SIGCHLD
) == 0);
4799 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
4800 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4801 * check with getpwuid() if the specific user already exists. Note that /etc might be
4802 * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4803 * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4804 * really just an extra safety net. We kinda assume that the UID range we allocate from is
4807 etc_passwd_lock
= take_etc_passwd_lock(NULL
);
4808 if (etc_passwd_lock
< 0 && etc_passwd_lock
!= -EROFS
)
4809 return log_error_errno(etc_passwd_lock
, "Failed to take /etc/passwd lock: %m");
4812 r
= barrier_create(&barrier
);
4814 return log_error_errno(r
, "Cannot initialize IPC barrier: %m");
4816 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, fd_inner_socket_pair
) < 0)
4817 return log_error_errno(errno
, "Failed to create inner socket pair: %m");
4819 if (socketpair(AF_UNIX
, SOCK_SEQPACKET
|SOCK_CLOEXEC
, 0, fd_outer_socket_pair
) < 0)
4820 return log_error_errno(errno
, "Failed to create outer socket pair: %m");
4822 /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4823 * parent's blocking calls and give it a chance to call wait() and terminate. */
4824 r
= sigprocmask(SIG_UNBLOCK
, &mask_chld
, NULL
);
4826 return log_error_errno(errno
, "Failed to change the signal mask: %m");
4828 r
= sigaction(SIGCHLD
, &sa
, NULL
);
4830 return log_error_errno(errno
, "Failed to install SIGCHLD handler: %m");
4832 if (arg_network_namespace_path
) {
4833 child_netns_fd
= open(arg_network_namespace_path
, O_RDONLY
|O_NOCTTY
|O_CLOEXEC
);
4834 if (child_netns_fd
< 0)
4835 return log_error_errno(errno
, "Cannot open file %s: %m", arg_network_namespace_path
);
4837 r
= fd_is_ns(child_netns_fd
, CLONE_NEWNET
);
4839 log_debug_errno(r
, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path
);
4841 return log_error_errno(r
, "Failed to check %s fs type: %m", arg_network_namespace_path
);
4843 return log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
4844 "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path
);
4847 *pid
= raw_clone(SIGCHLD
|CLONE_NEWNS
);
4849 return log_error_errno(errno
, "clone() failed%s: %m",
4851 ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4854 /* The outer child only has a file system namespace. */
4855 barrier_set_role(&barrier
, BARRIER_CHILD
);
4857 fd_inner_socket_pair
[0] = safe_close(fd_inner_socket_pair
[0]);
4858 fd_outer_socket_pair
[0] = safe_close(fd_outer_socket_pair
[0]);
4860 (void) reset_all_signal_handlers();
4861 (void) reset_signal_mask();
4863 r
= outer_child(&barrier
,
4866 fd_outer_socket_pair
[1],
4867 fd_inner_socket_pair
[1],
4871 _exit(EXIT_FAILURE
);
4873 _exit(EXIT_SUCCESS
);
4876 barrier_set_role(&barrier
, BARRIER_PARENT
);
4880 fd_inner_socket_pair
[1] = safe_close(fd_inner_socket_pair
[1]);
4881 fd_outer_socket_pair
[1] = safe_close(fd_outer_socket_pair
[1]);
4883 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
4884 mntns_fd
= receive_one_fd(fd_outer_socket_pair
[0], 0);
4886 return log_error_errno(mntns_fd
, "Failed to receive mount namespace fd from outer child: %m");
4888 /* The child just let us know the UID shift it might have read from the image. */
4889 l
= recv(fd_outer_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, 0);
4891 return log_error_errno(errno
, "Failed to read UID shift: %m");
4892 if (l
!= sizeof arg_uid_shift
)
4893 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading UID shift.");
4895 if (arg_userns_mode
== USER_NAMESPACE_PICK
) {
4896 /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4897 * image, but if that's already in use, pick a new one, and report back to the child,
4898 * which one we now picked. */
4900 r
= uid_shift_pick(&arg_uid_shift
, &uid_shift_lock
);
4902 return log_error_errno(r
, "Failed to pick suitable UID/GID range: %m");
4904 l
= send(fd_outer_socket_pair
[0], &arg_uid_shift
, sizeof arg_uid_shift
, MSG_NOSIGNAL
);
4906 return log_error_errno(errno
, "Failed to send UID shift: %m");
4907 if (l
!= sizeof arg_uid_shift
)
4908 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short write while writing UID shift.");
4911 n_bind_user_uid
= strv_length(arg_bind_user
);
4912 if (n_bind_user_uid
> 0) {
4913 /* Right after the UID shift, we'll receive the list of UID mappings for the
4914 * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4916 bind_user_uid
= new(uid_t
, n_bind_user_uid
*4);
4920 for (size_t i
= 0; i
< n_bind_user_uid
; i
++) {
4921 l
= recv(fd_outer_socket_pair
[0], bind_user_uid
+ i
*4, sizeof(uid_t
)*4, 0);
4923 return log_error_errno(errno
, "Failed to read user UID map pair: %m");
4924 if (l
!= sizeof(uid_t
)*4)
4925 return log_full_errno(l
== 0 ? LOG_DEBUG
: LOG_WARNING
,
4926 SYNTHETIC_ERRNO(EIO
),
4927 "Short read while reading bind user UID pairs.");
4932 if (arg_unified_cgroup_hierarchy
== CGROUP_UNIFIED_UNKNOWN
) {
4933 /* The child let us know the supported cgroup mode it might have read from the image. */
4934 l
= recv(fd_outer_socket_pair
[0], &arg_unified_cgroup_hierarchy
, sizeof(arg_unified_cgroup_hierarchy
), 0);
4936 return log_error_errno(errno
, "Failed to read cgroup mode: %m");
4937 if (l
!= sizeof(arg_unified_cgroup_hierarchy
))
4938 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading cgroup mode (%zi bytes).%s",
4939 l
, l
== 0 ? " The child is most likely dead." : "");
4942 /* Wait for the outer child. */
4943 r
= wait_for_terminate_and_check("(sd-namespace)", *pid
, WAIT_LOG_ABNORMAL
);
4946 if (r
!= EXIT_SUCCESS
)
4949 /* And now retrieve the PID of the inner child. */
4950 l
= recv(fd_outer_socket_pair
[0], pid
, sizeof *pid
, 0);
4952 return log_error_errno(errno
, "Failed to read inner child PID: %m");
4953 if (l
!= sizeof *pid
)
4954 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading inner child PID.");
4956 /* We also retrieve container UUID in case it was generated by outer child */
4957 l
= recv(fd_outer_socket_pair
[0], &arg_uuid
, sizeof arg_uuid
, 0);
4959 return log_error_errno(errno
, "Failed to read container machine ID: %m");
4960 if (l
!= sizeof(arg_uuid
))
4961 return log_error_errno(SYNTHETIC_ERRNO(EIO
), "Short read while reading container machined ID.");
4963 /* We also retrieve the socket used for notifications generated by outer child */
4964 notify_socket
= receive_one_fd(fd_outer_socket_pair
[0], 0);
4965 if (notify_socket
< 0)
4966 return log_error_errno(notify_socket
,
4967 "Failed to receive notification socket from the outer child: %m");
4969 log_debug("Init process invoked as PID "PID_FMT
, *pid
);
4971 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
4972 if (!barrier_place_and_sync(&barrier
)) /* #1 */
4973 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
4975 r
= setup_uid_map(*pid
, bind_user_uid
, n_bind_user_uid
);
4979 (void) barrier_place(&barrier
); /* #2 */
4982 if (arg_private_network
) {
4983 if (!arg_network_namespace_path
) {
4984 /* Wait until the child has unshared its network namespace. */
4985 if (!barrier_place_and_sync(&barrier
)) /* #3 */
4986 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early");
4989 if (child_netns_fd
< 0) {
4990 /* Make sure we have an open file descriptor to the child's network
4991 * namespace so it stays alive even if the child exits. */
4992 r
= namespace_open(*pid
, NULL
, NULL
, &child_netns_fd
, NULL
, NULL
);
4994 return log_error_errno(r
, "Failed to open child network namespace: %m");
4997 r
= move_network_interfaces(child_netns_fd
, arg_network_interfaces
);
5001 if (arg_network_veth
) {
5002 r
= setup_veth(arg_machine
, *pid
, veth_name
,
5003 arg_network_bridge
|| arg_network_zone
, &arg_network_provided_mac
);
5009 if (arg_network_bridge
) {
5010 /* Add the interface to a bridge */
5011 r
= setup_bridge(veth_name
, arg_network_bridge
, false);
5016 } else if (arg_network_zone
) {
5017 /* Add the interface to a bridge, possibly creating it */
5018 r
= setup_bridge(veth_name
, arg_network_zone
, true);
5026 r
= setup_veth_extra(arg_machine
, *pid
, arg_network_veth_extra
);
5030 /* We created the primary and extra veth links now; let's remember this, so that we know to
5031 remove them later on. Note that we don't bother with removing veth links that were created
5032 here when their setup failed half-way, because in that case the kernel should be able to
5033 remove them on its own, since they cannot be referenced by anything yet. */
5034 *veth_created
= true;
5036 r
= setup_macvlan(arg_machine
, *pid
, arg_network_macvlan
);
5040 r
= setup_ipvlan(arg_machine
, *pid
, arg_network_ipvlan
);
5045 if (arg_register
|| !arg_keep_unit
) {
5046 r
= sd_bus_default_system(&bus
);
5048 return log_error_errno(r
, "Failed to open system bus: %m");
5050 r
= sd_bus_set_close_on_exit(bus
, false);
5052 return log_error_errno(r
, "Failed to disable close-on-exit behaviour: %m");
5055 if (!arg_keep_unit
) {
5056 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5057 * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5058 * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5060 r
= sd_bus_match_signal_async(
5063 "org.freedesktop.systemd1",
5065 "org.freedesktop.systemd1.Scope",
5067 on_request_stop
, NULL
, PID_TO_PTR(*pid
));
5069 return log_error_errno(r
, "Failed to request RequestStop match: %m");
5073 r
= register_machine(
5081 arg_custom_mounts
, arg_n_custom_mounts
,
5084 arg_property_message
,
5086 arg_container_service_name
,
5091 } else if (!arg_keep_unit
) {
5097 arg_custom_mounts
, arg_n_custom_mounts
,
5100 arg_property_message
,
5101 /* allow_pidfds= */ true,
5106 } else if (arg_slice
|| arg_property
)
5107 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5109 r
= create_subcgroup(*pid
, arg_keep_unit
, arg_unified_cgroup_hierarchy
);
5113 r
= sync_cgroup(*pid
, arg_unified_cgroup_hierarchy
, arg_uid_shift
);
5117 r
= chown_cgroup(*pid
, arg_unified_cgroup_hierarchy
, arg_uid_shift
);
5121 /* Notify the child that the parent is ready with all
5122 * its setup (including cgroup-ification), and that
5123 * the child can now hand over control to the code to
5124 * run inside the container. */
5125 (void) barrier_place(&barrier
); /* #4 */
5127 /* Block SIGCHLD here, before notifying child.
5128 * process_pty() will handle it with the other signals. */
5129 assert_se(sigprocmask(SIG_BLOCK
, &mask_chld
, NULL
) >= 0);
5131 /* Reset signal to default */
5132 r
= default_signals(SIGCHLD
);
5134 return log_error_errno(r
, "Failed to reset SIGCHLD: %m");
5136 r
= sd_event_new(&event
);
5138 return log_error_errno(r
, "Failed to get default event source: %m");
5140 (void) sd_event_set_watchdog(event
, true);
5143 r
= sd_bus_attach_event(bus
, event
, 0);
5145 return log_error_errno(r
, "Failed to attach bus to event loop: %m");
5148 r
= setup_notify_parent(event
, notify_socket
, PID_TO_PTR(*pid
), ¬ify_event_source
);
5152 /* Wait until the child is completely ready now, and has mounted its own copies of procfs and so on,
5153 * before we take the fully visible instances away. */
5154 if (!barrier_sync(&barrier
)) /* #5.1 */
5155 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
5157 if (arg_userns_mode
!= USER_NAMESPACE_NO
) {
5158 r
= wipe_fully_visible_fs(mntns_fd
);
5161 mntns_fd
= safe_close(mntns_fd
);
5164 /* And now let the child know that we completed removing the procfs instances, and it can start the
5166 if (!barrier_place(&barrier
)) /* #5.2 */
5167 return log_error_errno(SYNTHETIC_ERRNO(ESRCH
), "Child died too early.");
5169 /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5170 * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5171 etc_passwd_lock
= safe_close(etc_passwd_lock
);
5173 (void) sd_notifyf(false,
5174 "STATUS=Container running.\n"
5175 "X_NSPAWN_LEADER_PID=" PID_FMT
, *pid
);
5176 if (!arg_notify_ready
) {
5177 r
= sd_notify(false, "READY=1\n");
5179 log_warning_errno(r
, "Failed to send readiness notification, ignoring: %m");
5182 if (arg_kill_signal
> 0) {
5183 /* Try to kill the init system on SIGINT or SIGTERM */
5184 (void) sd_event_add_signal(event
, NULL
, SIGINT
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
5185 (void) sd_event_add_signal(event
, NULL
, SIGTERM
, on_orderly_shutdown
, PID_TO_PTR(*pid
));
5187 /* Immediately exit */
5188 (void) sd_event_add_signal(event
, NULL
, SIGINT
, NULL
, NULL
);
5189 (void) sd_event_add_signal(event
, NULL
, SIGTERM
, NULL
, NULL
);
5192 (void) sd_event_add_signal(event
, NULL
, SIGRTMIN
+18, sigrtmin18_handler
, NULL
);
5194 r
= sd_event_add_memory_pressure(event
, NULL
, NULL
, NULL
);
5196 log_debug_errno(r
, "Failed allocate memory pressure event source, ignoring: %m");
5198 /* Exit when the child exits */
5199 (void) sd_event_add_signal(event
, NULL
, SIGCHLD
, on_sigchld
, PID_TO_PTR(*pid
));
5201 /* Retrieve the kmsg fifo allocated by inner child */
5202 fd_kmsg_fifo
= receive_one_fd(fd_inner_socket_pair
[0], 0);
5203 if (fd_kmsg_fifo
< 0)
5204 return log_error_errno(fd_kmsg_fifo
, "Failed to receive kmsg fifo from inner child: %m");
5206 if (arg_expose_ports
) {
5207 r
= expose_port_watch_rtnl(event
, fd_inner_socket_pair
[0], on_address_change
, expose_args
, &rtnl
);
5211 (void) expose_port_execute(rtnl
, &expose_args
->fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
->address4
);
5212 (void) expose_port_execute(rtnl
, &expose_args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
->address6
);
5215 if (arg_console_mode
!= CONSOLE_PIPE
) {
5216 _cleanup_close_
int fd
= -EBADF
;
5217 PTYForwardFlags flags
= 0;
5219 /* Retrieve the master pty allocated by inner child */
5220 fd
= receive_one_fd(fd_inner_socket_pair
[0], 0);
5222 return log_error_errno(fd
, "Failed to receive master pty from the inner child: %m");
5224 switch (arg_console_mode
) {
5226 case CONSOLE_READ_ONLY
:
5227 flags
|= PTY_FORWARD_READ_ONLY
;
5231 case CONSOLE_INTERACTIVE
:
5232 flags
|= PTY_FORWARD_IGNORE_VHANGUP
;
5234 r
= pty_forward_new(event
, fd
, flags
, &forward
);
5236 return log_error_errno(r
, "Failed to create PTY forwarder: %m");
5238 if (arg_console_width
!= UINT_MAX
|| arg_console_height
!= UINT_MAX
)
5239 (void) pty_forward_set_width_height(forward
,
5241 arg_console_height
);
5245 assert(arg_console_mode
== CONSOLE_PASSIVE
);
5248 *master
= TAKE_FD(fd
);
5251 fd_inner_socket_pair
[0] = safe_close(fd_inner_socket_pair
[0]);
5253 r
= sd_event_loop(event
);
5255 return log_error_errno(r
, "Failed to run event loop: %m");
5260 (void) pty_forward_get_last_char(forward
, &last_char
);
5261 forward
= pty_forward_free(forward
);
5263 if (!arg_quiet
&& last_char
!= '\n')
5267 /* Kill if it is not dead yet anyway */
5268 if (!arg_register
&& !arg_keep_unit
&& bus
)
5269 terminate_scope(bus
, arg_machine
);
5271 /* Normally redundant, but better safe than sorry */
5272 (void) kill(*pid
, SIGKILL
);
5274 fd_kmsg_fifo
= safe_close(fd_kmsg_fifo
);
5276 if (arg_private_network
) {
5277 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5278 * to avoid having to move the parent to the child network namespace. */
5279 r
= safe_fork(NULL
, FORK_RESET_SIGNALS
|FORK_DEATHSIG_SIGTERM
|FORK_WAIT
|FORK_LOG
, NULL
);
5284 _cleanup_close_
int parent_netns_fd
= -EBADF
;
5286 r
= namespace_open(getpid_cached(), NULL
, NULL
, &parent_netns_fd
, NULL
, NULL
);
5288 log_error_errno(r
, "Failed to open parent network namespace: %m");
5289 _exit(EXIT_FAILURE
);
5292 r
= namespace_enter(-1, -1, child_netns_fd
, -1, -1);
5294 log_error_errno(r
, "Failed to enter child network namespace: %m");
5295 _exit(EXIT_FAILURE
);
5298 /* Reverse network interfaces pair list so that interfaces get their initial name back.
5299 * This is about ensuring interfaces get their old name back when being moved back. */
5300 arg_network_interfaces
= strv_reverse(arg_network_interfaces
);
5302 r
= move_network_interfaces(parent_netns_fd
, arg_network_interfaces
);
5304 log_error_errno(r
, "Failed to move network interfaces back to parent network namespace: %m");
5306 _exit(r
< 0 ? EXIT_FAILURE
: EXIT_SUCCESS
);
5310 r
= wait_for_container(TAKE_PID(*pid
), &container_status
);
5312 /* Tell machined that we are gone. */
5314 (void) unregister_machine(bus
, arg_machine
);
5317 /* We failed to wait for the container, or the container exited abnormally. */
5319 if (r
> 0 || container_status
== CONTAINER_TERMINATED
) {
5320 /* r > 0 → The container exited with a non-zero status.
5321 * As a special case, we need to replace 133 with a different value,
5322 * because 133 is special-cased in the service file to reboot the container.
5323 * otherwise → The container exited with zero status and a reboot was not requested.
5325 if (r
== EXIT_FORCE_RESTART
)
5326 r
= EXIT_FAILURE
; /* replace 133 with the general failure code */
5328 return 0; /* finito */
5331 /* CONTAINER_REBOOTED, loop again */
5333 if (arg_keep_unit
) {
5334 /* Special handling if we are running as a service: instead of simply
5335 * restarting the machine we want to restart the entire service, so let's
5336 * inform systemd about this with the special exit code 133. The service
5337 * file uses RestartForceExitStatus=133 so that this results in a full
5338 * nspawn restart. This is necessary since we might have cgroup parameters
5339 * set we want to have flushed out. */
5340 *ret
= EXIT_FORCE_RESTART
;
5341 return 0; /* finito */
5344 expose_port_flush(&expose_args
->fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
->address4
);
5345 expose_port_flush(&expose_args
->fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
->address6
);
5347 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
5348 *veth_created
= false;
5349 return 1; /* loop again */
/* Fills in arg_rlimit[rl] for every resource limit whose bit (SETTING_RLIMIT_FIRST << rl) is clear in
 * arg_settings_mask, i.e. that was not configured explicitly:
 *  - RLIMIT_NPROC and RLIMIT_SIGPENDING are queried from PID 1 with prlimit();
 *  - RLIMIT_NOFILE starts from its kernel_defaults[] entry, with rlim_max replaced by
 *    MIN(read_nr_open(), HIGH_RLIMIT_NOFILE);
 *  - every other limit is pointed at its entry in the static kernel_defaults[] table.
 * The chosen value is duplicated into arg_rlimit[rl] with newdup(). With debug logging enabled the
 * resulting limit is formatted and logged. A failing prlimit() is reported through
 * log_error_errno(errno, ...) and that result is returned.
 *
 * NOTE(review): this listing has lost lines (the declaration of rl, the assignments that presumably
 * point v at &buffer, the branch taken when newdup() returns NULL, the closing line of the
 * "We nowadays bump RLIMIT_NOFILE" comment and the final return) - confirm against the complete source. */
5352 static int initialize_rlimits(void) {
5353 /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5354 * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5355 * container execution environments. */
5357 static const struct rlimit kernel_defaults
[_RLIMIT_MAX
] = {
5358 [RLIMIT_AS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5359 [RLIMIT_CORE
] = { 0, RLIM_INFINITY
},
5360 [RLIMIT_CPU
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5361 [RLIMIT_DATA
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5362 [RLIMIT_FSIZE
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5363 [RLIMIT_LOCKS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5364 [RLIMIT_MEMLOCK
] = { DEFAULT_RLIMIT_MEMLOCK
, DEFAULT_RLIMIT_MEMLOCK
},
5365 [RLIMIT_MSGQUEUE
] = { 819200, 819200 },
5366 [RLIMIT_NICE
] = { 0, 0 },
/* The loop below copies this entry into a local buffer and replaces rlim_max with
 * MIN(read_nr_open(), HIGH_RLIMIT_NOFILE). */
5367 [RLIMIT_NOFILE
] = { 1024, 4096 },
5368 [RLIMIT_RSS
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5369 [RLIMIT_RTPRIO
] = { 0, 0 },
5370 [RLIMIT_RTTIME
] = { RLIM_INFINITY
, RLIM_INFINITY
},
5371 [RLIMIT_STACK
] = { 8388608, RLIM_INFINITY
},
5373 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5374 * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5375 * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5376 * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5377 * that PID 1 changes a number of other resource limits during early initialization which is why we
5378 * don't read the other limits from PID 1 but prefer the static table above. */
5383 for (rl
= 0; rl
< _RLIMIT_MAX
; rl
++) {
5384 /* Let's only fill in what the user hasn't explicitly configured anyway */
5385 if ((arg_settings_mask
& (SETTING_RLIMIT_FIRST
<< rl
)) == 0) {
5386 const struct rlimit
*v
;
5387 struct rlimit buffer
;
5389 if (IN_SET(rl
, RLIMIT_NPROC
, RLIMIT_SIGPENDING
)) {
5390 /* For these two let's read the limits off PID 1. See above for an explanation. */
/* prlimit(pid, resource, new_limit, old_limit): a NULL new_limit changes nothing, the current
 * limit of PID 1 is stored in buffer. */
5392 if (prlimit(1, rl
, NULL
, &buffer
) < 0)
5393 return log_error_errno(errno
, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl
));
5396 } else if (rl
== RLIMIT_NOFILE
) {
5397 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5398 * userspace. Given that nspawn containers are often run without our PID 1,
5399 * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5400 * so that container userspace gets similar resources as host userspace
5402 buffer
= kernel_defaults
[rl
];
5403 buffer
.rlim_max
= MIN((rlim_t
) read_nr_open(), (rlim_t
) HIGH_RLIMIT_NOFILE
);
5406 v
= kernel_defaults
+ rl
;
5408 arg_rlimit
[rl
] = newdup(struct rlimit
, v
, 1);
5409 if (!arg_rlimit
[rl
])
5413 if (DEBUG_LOGGING
) {
5414 _cleanup_free_
char *k
= NULL
;
5416 (void) rlimit_format(arg_rlimit
[rl
], &k
);
5417 log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl
), k
);
/* Guard for --image=: checks that udev is reachable and lives in our network namespace.
 *
 * Visible steps: allocate a non-blocking, close-on-exec AF_UNIX/SOCK_SEQPACKET socket, connect it to
 * "/run/udev/control", fetch the peer credentials with getpeercred() and pass the peer PID to
 * in_same_namespace(..., NAMESPACE_NET). A connect result of -ENOENT or a disconnect-type error, and the
 * final namespace check, are reported with a synthetic EOPNOTSUPP; other failures are reported with the
 * error of the failing call.
 *
 * NOTE(review): this listing has lost lines (the declarations of r and ucred, the conditions guarding
 * the individual returns, the early return for the no-image case and the success return) - confirm
 * against the complete source. */
5424 static int cant_be_in_netns(void) {
5425 _cleanup_close_
int fd
= -EBADF
;
5429 /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5430 * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5433 if (!arg_image
) /* only matters if --image= is used, i.e. we actually need to use loopback devices */
5436 fd
= socket(AF_UNIX
, SOCK_SEQPACKET
|SOCK_NONBLOCK
|SOCK_CLOEXEC
, 0);
5438 return log_error_errno(errno
, "Failed to allocate udev control socket: %m");
5440 r
= connect_unix_path(fd
, AT_FDCWD
, "/run/udev/control");
5441 if (r
== -ENOENT
|| ERRNO_IS_NEG_DISCONNECT(r
))
5442 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
5443 "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5445 return log_error_errno(r
, "Failed to connect socket to udev control socket: %m");
5447 r
= getpeercred(fd
, &ucred
);
5449 return log_error_errno(r
, "Failed to determine peer of udev control socket: %m");
/* ucred.pid is the process on the other end of the control socket.
 * NOTE(review): the second argument 0 presumably stands for the calling process - confirm against
 * in_same_namespace(). */
5451 r
= in_same_namespace(ucred
.pid
, 0, NAMESPACE_NET
);
5453 return log_error_errno(r
, "Failed to determine network namespace of udev: %m");
5455 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP
),
5456 "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
/* Top-level driver of the program; it is handed to DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE() at the
 * end of the file.
 *
 * Steps visible in this listing, in order:
 *  - parse the logging environment and the command line, and warn when the effective UID is not 0;
 *  - call cant_be_in_netns(), initialize_rlimits(), load_oci_bundle(), determine_names(),
 *    load_settings(), verify_arguments() and verify_network_interfaces_initialized();
 *  - ignore SIGPIPE and collect file descriptors announced by sd_listen_fds();
 *  - prepare the container root: either a directory tree (optionally an ephemeral snapshot, or
 *    populated from a template) or a disk image (optionally an ephemeral copy) that is attached to a
 *    loop device and dissected;
 *  - choose a console mode, block signals, become a subreaper, create the firewall context when ports
 *    are to be exposed, and call run_container();
 *  - send a STOPPING=1 notification, kill and reap the child, flush the pty master to stdout, and
 *    remove what was created (snapshot, image copy, temporary root directory, propagation directory,
 *    exposed ports, veth links, bridge, argument arrays).
 *
 * NOTE(review): this listing has lost many lines (the conditions guarding most error reports, the
 * jump/return statements, several call arguments and declarations such as pid, p and k), so the exact
 * error paths and the return value cannot be confirmed from here. */
5460 static int run(int argc
, char *argv
[]) {
5461 bool remove_directory
= false, remove_image
= false, veth_created
= false, remove_tmprootdir
= false;
5462 _cleanup_close_
int master
= -EBADF
;
5463 _cleanup_fdset_free_ FDSet
*fds
= NULL
;
5464 int r
, n_fd_passed
, ret
= EXIT_SUCCESS
;
5465 char veth_name
[IFNAMSIZ
] = "";
5466 struct ExposeArgs expose_args
= {};
5467 _cleanup_(release_lock_file
) LockFile tree_global_lock
= LOCK_FILE_INIT
, tree_local_lock
= LOCK_FILE_INIT
;
5468 char tmprootdir
[] = "/tmp/nspawn-root-XXXXXX";
5469 _cleanup_(loop_device_unrefp
) LoopDevice
*loop
= NULL
;
5470 _cleanup_(dissected_image_unrefp
) DissectedImage
*dissected_image
= NULL
;
5471 _cleanup_(fw_ctx_freep
) FirewallContext
*fw_ctx
= NULL
;
5474 log_parse_environment();
5477 r
= parse_argv(argc
, argv
);
5481 if (geteuid() != 0) {
5482 r
= log_warning_errno(SYNTHETIC_ERRNO(EPERM
),
5483 argc
>= 2 ? "Need to be root." :
5484 "Need to be root (and some arguments are usually required).\nHint: try --help");
5488 r
= cant_be_in_netns();
5492 r
= initialize_rlimits();
5496 r
= load_oci_bundle();
5500 r
= determine_names();
5504 r
= load_settings();
5510 log_error_errno(r
, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5514 r
= verify_arguments();
5518 r
= verify_network_interfaces_initialized();
5522 /* Reapply environment settings. */
5523 (void) detect_unified_cgroup_hierarchy_from_environment();
5525 /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5526 * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5527 * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5528 (void) ignore_signals(SIGPIPE
);
5530 n_fd_passed
= sd_listen_fds(false);
5531 if (n_fd_passed
> 0) {
5532 r
= fdset_new_listen_fds(&fds
, false);
5534 log_error_errno(r
, "Failed to collect file descriptors: %m");
5539 /* The "default" umask. This is appropriate for most file and directory
5540 * operations performed by nspawn, and is the umask that will be used for
5541 * the child. Functions like copy_devnodes() change the umask temporarily. */
/* Case 1: the container root is a directory tree (arg_directory is set). */
5544 if (arg_directory
) {
5547 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5548 * /var from the host will propagate into container dynamically (because bad things happen if
5549 * two systems write to the same /var). Let's allow it for the special cases where /var is
5550 * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5551 if (path_equal(arg_directory
, "/") && !(arg_ephemeral
|| IN_SET(arg_volatile_mode
, VOLATILE_YES
, VOLATILE_STATE
))) {
5552 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5553 "Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5557 if (arg_ephemeral
) {
5558 _cleanup_free_
char *np
= NULL
;
5560 r
= chase_and_update(&arg_directory
, 0);
5564 /* If the specified path is a mount point we generate the new snapshot immediately
5565 * inside it under a random name. However if the specified path is not a mount point we
5566 * create the new snapshot in the parent directory, just next to it. */
5567 r
= path_is_mount_point(arg_directory
, NULL
, 0);
5569 log_error_errno(r
, "Failed to determine whether directory %s is mount point: %m", arg_directory
);
5573 r
= tempfn_random_child(arg_directory
, "machine.", &np
);
5575 r
= tempfn_random(arg_directory
, "machine.", &np
);
5577 log_error_errno(r
, "Failed to generate name for directory snapshot: %m");
5581 /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5582 * only owned by us and no one else. */
5583 r
= image_path_lock(np
, LOCK_EX
|LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5585 log_error_errno(r
, "Failed to lock %s: %m", np
);
5590 BLOCK_SIGNALS(SIGINT
);
5591 r
= btrfs_subvol_snapshot_at(AT_FDCWD
, arg_directory
, AT_FDCWD
, np
,
5592 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
5593 BTRFS_SNAPSHOT_FALLBACK_COPY
|
5594 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
5595 BTRFS_SNAPSHOT_RECURSIVE
|
5596 BTRFS_SNAPSHOT_QUOTA
|
5597 BTRFS_SNAPSHOT_SIGINT
);
5600 log_error_errno(r
, "Interrupted while copying file system tree to %s, removed again.", np
);
5604 log_error_errno(r
, "Failed to create snapshot %s from %s: %m", np
, arg_directory
);
5608 free_and_replace(arg_directory
, np
);
5609 remove_directory
= true;
5611 r
= chase_and_update(&arg_directory
, arg_template
? CHASE_NONEXISTENT
: 0);
5615 r
= image_path_lock(arg_directory
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5617 log_error_errno(r
, "Directory tree %s is currently busy.", arg_directory
);
5621 log_error_errno(r
, "Failed to lock %s: %m", arg_directory
);
5626 r
= chase_and_update(&arg_template
, 0);
5631 BLOCK_SIGNALS(SIGINT
);
5632 r
= btrfs_subvol_snapshot_at(AT_FDCWD
, arg_template
, AT_FDCWD
, arg_directory
,
5633 (arg_read_only
? BTRFS_SNAPSHOT_READ_ONLY
: 0) |
5634 BTRFS_SNAPSHOT_FALLBACK_COPY
|
5635 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY
|
5636 BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE
|
5637 BTRFS_SNAPSHOT_RECURSIVE
|
5638 BTRFS_SNAPSHOT_QUOTA
|
5639 BTRFS_SNAPSHOT_SIGINT
);
5642 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
5643 "Directory %s already exists, not populating from template %s.", arg_directory
, arg_template
);
5644 else if (r
== -EINTR
) {
5645 log_error_errno(r
, "Interrupted while copying file system tree to %s, removed again.", arg_directory
);
5648 log_error_errno(r
, "Couldn't create snapshot %s from %s: %m", arg_directory
, arg_template
);
5651 log_full(arg_quiet
? LOG_DEBUG
: LOG_INFO
,
5652 "Populated %s from template %s.", arg_directory
, arg_template
);
5656 if (arg_start_mode
== START_BOOT
) {
5657 _cleanup_free_
char *b
= NULL
;
5660 if (arg_pivot_root_new
) {
5661 b
= path_join(arg_directory
, arg_pivot_root_new
);
5669 if (path_is_os_tree(p
) <= 0) {
5670 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5671 "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p
);
5675 _cleanup_free_
char *p
= NULL
;
5677 if (arg_pivot_root_new
)
5678 p
= path_join(arg_directory
, arg_pivot_root_new
, "/usr/");
5680 p
= path_join(arg_directory
, "/usr/");
5684 if (laccess(p
, F_OK
) < 0) {
5685 r
= log_error_errno(SYNTHETIC_ERRNO(EINVAL
),
5686 "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory
);
/* Case 2: the container root is a disk image (arg_image): lock it, attach it to a loop device and
 * dissect it. NOTE(review): presumably the alternative branch of the arg_directory check above; the
 * branch keyword itself is not visible in this listing. */
5692 DissectImageFlags dissect_image_flags
=
5693 DISSECT_IMAGE_GENERIC_ROOT
|
5694 DISSECT_IMAGE_REQUIRE_ROOT
|
5695 DISSECT_IMAGE_RELAX_VAR_CHECK
|
5696 DISSECT_IMAGE_USR_NO_ROOT
|
5697 DISSECT_IMAGE_ADD_PARTITION_DEVICES
|
5698 DISSECT_IMAGE_PIN_PARTITION_DEVICES
;
5700 assert(!arg_template
);
5702 r
= chase_and_update(&arg_image
, 0);
5706 if (arg_ephemeral
) {
5707 _cleanup_free_
char *np
= NULL
;
5709 r
= tempfn_random(arg_image
, "machine.", &np
);
5711 log_error_errno(r
, "Failed to generate name for image snapshot: %m");
5715 /* Always take an exclusive lock on our own ephemeral copy. */
5716 r
= image_path_lock(np
, LOCK_EX
|LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5718 r
= log_error_errno(r
, "Failed to create image lock: %m");
5723 BLOCK_SIGNALS(SIGINT
);
5724 r
= copy_file_full(arg_image
, np
, O_EXCL
, arg_read_only
? 0400 : 0600,
5725 FS_NOCOW_FL
, FS_NOCOW_FL
,
5726 COPY_REFLINK
|COPY_CRTIME
|COPY_SIGINT
,
5730 log_error_errno(r
, "Interrupted while copying image file to %s, removed again.", np
);
5734 r
= log_error_errno(r
, "Failed to copy image file: %m");
5738 free_and_replace(arg_image
, np
);
5739 remove_image
= true;
5741 r
= image_path_lock(arg_image
, (arg_read_only
? LOCK_SH
: LOCK_EX
) | LOCK_NB
, &tree_global_lock
, &tree_local_lock
);
5743 r
= log_error_errno(r
, "Disk image %s is currently busy.", arg_image
);
5747 r
= log_error_errno(r
, "Failed to create image lock: %m");
5751 r
= verity_settings_load(
5752 &arg_verity_settings
,
5753 arg_image
, NULL
, NULL
);
5755 log_error_errno(r
, "Failed to read verity artefacts for %s: %m", arg_image
);
5759 if (arg_verity_settings
.data_path
)
5760 dissect_image_flags
|= DISSECT_IMAGE_NO_PARTITION_TABLE
;
/* mkdtemp() replaces the XXXXXX of tmprootdir in place; a copy of the resulting path becomes
 * arg_directory below. */
5763 if (!mkdtemp(tmprootdir
)) {
5764 r
= log_error_errno(errno
, "Failed to create temporary directory: %m");
5768 remove_tmprootdir
= true;
5770 arg_directory
= strdup(tmprootdir
);
5771 if (!arg_directory
) {
5776 r
= loop_device_make_by_path(
5778 arg_read_only
? O_RDONLY
: O_RDWR
,
5779 /* sector_size= */ UINT32_MAX
,
5780 FLAGS_SET(dissect_image_flags
, DISSECT_IMAGE_NO_PARTITION_TABLE
) ? 0 : LO_FLAGS_PARTSCAN
,
5784 log_error_errno(r
, "Failed to set up loopback block device: %m");
5788 r
= dissect_loop_device_and_warn(
5790 &arg_verity_settings
,
5791 /* mount_options=*/ NULL
,
5792 arg_image_policy
?: &image_policy_container
,
5793 dissect_image_flags
,
5796 /* dissect_loop_device_and_warn() already printed a brief error message. Extend on that with more details */
5797 log_notice("Note that the disk image needs to\n"
5798 " a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5799 " b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5800 " c) or follow https://uapi-group.org/specifications/specs/discoverable_partitions_specification\n"
5801 " d) or contain a file system without a partition table\n"
5802 "in order to be bootable with systemd-nspawn.");
5808 r
= dissected_image_load_verity_sig_partition(
5811 &arg_verity_settings
);
5815 if (dissected_image
->has_verity
&& !arg_verity_settings
.root_hash
&& !dissected_image
->has_verity_sig
)
5816 log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5817 "root hash signature found! Proceeding without integrity checking.", arg_image
);
5819 r
= dissected_image_decrypt_interactively(
5822 &arg_verity_settings
,
5827 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5828 if (remove_image
&& unlink(arg_image
) >= 0)
5829 remove_image
= false;
5831 if (arg_architecture
< 0)
5832 arg_architecture
= dissected_image_architecture(dissected_image
);
5835 r
= custom_mount_prepare_all(arg_directory
, arg_custom_mounts
, arg_n_custom_mounts
);
5839 if (arg_console_mode
< 0)
5841 isatty(STDIN_FILENO
) > 0 &&
5842 isatty(STDOUT_FILENO
) > 0 ? CONSOLE_INTERACTIVE
: CONSOLE_READ_ONLY
;
5844 if (arg_console_mode
== CONSOLE_PIPE
) /* if we pass STDERR on to the container, don't add our own logs into it too */
5848 log_info("Spawning container %s on %s.\nPress Ctrl-] three times within 1s to kill container.",
5849 arg_machine
, arg_image
?: arg_directory
);
5851 assert_se(sigprocmask_many(SIG_BLOCK
, NULL
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, SIGRTMIN
+18, -1) >= 0);
5853 r
= make_reaper_process(true);
5855 log_error_errno(r
, "Failed to become subreaper: %m");
5859 if (arg_expose_ports
) {
5860 r
= fw_ctx_new(&fw_ctx
);
5862 log_error_errno(r
, "Cannot expose configured ports, firewall initialization failed: %m");
5865 expose_args
.fw_ctx
= fw_ctx
;
5868 r
= run_container(dissected_image
,
5870 veth_name
, &veth_created
,
5871 &expose_args
, &master
,
/* The "Restarting..." status is sent only when r is 0 and ret is EXIT_FORCE_RESTART; in every other
 * case "Terminating..." is sent. */
5878 (void) sd_notify(false,
5879 r
== 0 && ret
== EXIT_FORCE_RESTART
? "STOPPING=1\nSTATUS=Restarting..." :
5880 "STOPPING=1\nSTATUS=Terminating...");
5883 (void) kill(pid
, SIGKILL
);
5885 /* Try to flush whatever is still queued in the pty */
5887 (void) copy_bytes(master
, STDOUT_FILENO
, UINT64_MAX
, 0);
5888 master
= safe_close(master
);
5892 (void) wait_for_terminate(pid
, NULL
);
/* Cleanup of what the remove_directory, remove_image and remove_tmprootdir flags recorded above,
 * followed by networking and argument teardown. Failures here are only logged. */
5896 if (remove_directory
&& arg_directory
) {
5899 k
= rm_rf(arg_directory
, REMOVE_ROOT
|REMOVE_PHYSICAL
|REMOVE_SUBVOLUME
);
5901 log_warning_errno(k
, "Cannot remove '%s', ignoring: %m", arg_directory
);
5904 if (remove_image
&& arg_image
) {
5905 if (unlink(arg_image
) < 0)
5906 log_warning_errno(errno
, "Can't remove image file '%s', ignoring: %m", arg_image
);
5909 if (remove_tmprootdir
) {
5910 if (rmdir(tmprootdir
) < 0)
5911 log_debug_errno(errno
, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir
);
5917 p
= strjoina("/run/systemd/nspawn/propagate/", arg_machine
);
5918 (void) rm_rf(p
, REMOVE_ROOT
);
5921 expose_port_flush(&fw_ctx
, arg_expose_ports
, AF_INET
, &expose_args
.address4
);
5922 expose_port_flush(&fw_ctx
, arg_expose_ports
, AF_INET6
, &expose_args
.address6
);
5925 (void) remove_veth_links(veth_name
, arg_network_veth_extra
);
5926 (void) remove_bridge(arg_network_zone
);
5928 custom_mount_free_all(arg_custom_mounts
, arg_n_custom_mounts
);
5929 expose_port_free_all(arg_expose_ports
);
5930 rlimit_free_all(arg_rlimit
);
5931 device_node_array_free(arg_extra_nodes
, arg_n_extra_nodes
);
5932 credential_free_all(arg_credentials
, arg_n_credentials
);
/* Hands run() to the main-function helper macro (main-func.h is included at the top of the file).
 * NOTE(review): judging by the macro name, a positive return value of run() is presumably turned into
 * a failing process exit status - confirm against main-func.h. */
5940 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run
);