From 3a47437fc9f37637c2924a7663cf2b8849ceb10d Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 27 Jun 2025 14:17:00 +0200 Subject: [PATCH] core: Introduce PrivateBPF= to mount a private BPFFS Add a new option PrivateBPF= to mount a new instance of bpffs within a namespace. PrivateBPF= can be set to "no" to keep using the host bpffs (remounted read-only when ProtectKernelTunables= is set) or to "yes" to mount a new, private instance. The mount is done with the new fsopen()/fsmount() API because in the future we'll hook some commands between the two calls. --- man/org.freedesktop.systemd1.xml | 28 +++++++ man/systemd.exec.xml | 10 +++ src/core/dbus-execute.c | 6 ++ src/core/exec-invoke.c | 112 ++++++++++++++++++++++++- src/core/execute-serialize.c | 8 ++ src/core/execute.c | 7 +- src/core/execute.h | 2 + src/core/load-fragment-gperf.gperf.in | 1 + src/core/load-fragment.c | 1 + src/core/load-fragment.h | 1 + src/core/namespace.c | 80 +++++++++++++++++- src/core/namespace.h | 13 +++ src/shared/bus-unit-util.c | 1 + test/units/TEST-07-PID1.private-bpf.sh | 21 +++++ 14 files changed, 284 insertions(+), 7 deletions(-) create mode 100755 test/units/TEST-07-PID1.private-bpf.sh diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index d5f270c6817..95fb54d2f29 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -3374,6 +3374,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (ss) ProtectHostnameEx = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivateBPF = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b MemoryKSM = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; @@ -3975,6 +3977,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4701,6 +4705,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -5583,6 +5589,8 @@ node 
/org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (ss) ProtectHostnameEx = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivateBPF = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b MemoryKSM = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; @@ -6204,6 +6212,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6910,6 +6920,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7616,6 +7628,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (ss) ProtectHostnameEx = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivateBPF = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b MemoryKSM = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; @@ -8159,6 +8173,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8773,6 +8789,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -9612,6 +9630,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly (ss) ProtectHostnameEx = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s PrivateBPF = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b MemoryKSM = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; @@ -10137,6 +10157,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10733,6 +10755,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -12316,6 +12340,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PrivatePIDs were added in version 257. 
ProtectHostnameEx, DelegateNamespaces, + PrivateBPF, RemoveSubGroup(), StateDirectoryQuota, StateDirectoryQuotaUsage, @@ -12374,6 +12399,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PassPIDFD, AcceptFileDescriptors, DelegateNamespaces, + PrivateBPF, RemoveSubgroup(), DeferTrigger, DeferTriggerMaxUSec, @@ -12429,6 +12455,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PrivatePIDs were added in version 257. ProtectHostnameEx, DelegateNamespaces, + PrivateBPF, RemoveSubgroup(), ReloadResult, CleanResult, @@ -12484,6 +12511,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ PrivatePIDs were added in version 257. ProtectHostnameEx, DelegateNamespaces, + PrivateBPF, RemoveSubgroup(), StateDirectoryQuota, StateDirectoryQuotaUsage, diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 813ea023138..85db1de264e 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2555,6 +2555,16 @@ RestrictNamespaces=~cgroup net + + PrivateBPF= + + Takes a boolean argument. If set, mount a private instance of the BPF filesystem + on /sys/fs/bpf/. Otherwise, if ProtectKernelTunables= is set, + the instance from the host is inherited but mounted read-only. Defaults to false. 
+ + + + LockPersonality= diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index b1e3df1688a..7e4d6fa6dbf 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -54,6 +54,7 @@ BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_ static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset); +static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_private_bpf, private_bpf, PrivateBPF); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long); @@ -1316,6 +1317,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostname", "b", property_get_protect_hostname, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostnameEx", "(ss)", property_get_protect_hostname_ex, 0, SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateBPF", "s", property_get_private_bpf, offsetof(ExecContext, private_bpf), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1753,6 +1755,7 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_fr 
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string); +static BUS_DEFINE_SET_TRANSIENT_PARSE(private_bpf, PrivateBPF, private_bpf_from_string); BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality); static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check); @@ -2279,6 +2282,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "ProcSubset")) return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error); + if (streq(name, "PrivateBPF")) + return bus_set_transient_private_bpf(u, name, &c->private_bpf, message, flags, error); + if (streq(name, "RuntimeDirectoryPreserve")) return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error); diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 09deb0f5c11..e6fce99340b 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2270,6 +2270,61 @@ static int setup_private_users_child(int unshare_ready_fd, const char *uid_map, return 0; } +static int bpffs_prepare( + PidRef *ret_pid, + int *ret_sock_fd, + int *ret_errno_pipe) { + + _cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR; + int r; + + assert(ret_sock_fd); + assert(ret_pid); + assert(ret_errno_pipe); + + r = pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK); + if (r < 0) + return log_debug_errno(errno, "Failed to create pipe: %m"); + + r = socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, socket_fds); + if (r < 0) + return log_debug_errno(errno, "Failed to create socket pair: %m"); + + r = 
pidref_safe_fork("(sd-bpffs)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, ret_pid); + if (r < 0) + return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m"); + if (r == 0) { + _cleanup_close_ int fs_fd = -EBADF; + + bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]); + socket_fds[0] = safe_close(socket_fds[0]); + + fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0); + if (fs_fd < 0) { + log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m"); + report_errno_and_exit(bpffs_errno_pipe[1], fs_fd); + } + + r = fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0); + if (r < 0) { + log_debug_errno(errno, "Failed to create bpffs superblock: %m"); + report_errno_and_exit(bpffs_errno_pipe[1], errno); + } + + if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) { + log_debug_errno(errno, "Failed to send data to child: %m"); + report_errno_and_exit(bpffs_errno_pipe[1], errno); + } + + _exit(EXIT_SUCCESS); + } + + *ret_sock_fd = TAKE_FD(socket_fds[0]); + *ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]); + + return 0; +} + static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) { _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; @@ -3600,9 +3655,10 @@ static int apply_mount_namespace( ExecRuntime *runtime, const char *memory_pressure_path, bool needs_sandboxing, - char **reterr_path, uid_t exec_directory_uid, - gid_t exec_directory_gid) { + gid_t exec_directory_gid, + int bpffs_socket_fd, + char **reterr_path) { _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL, @@ -3814,6 +3870,9 @@ static int apply_mount_namespace( .protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO, .protect_proc = needs_sandboxing ? 
context->protect_proc : PROTECT_PROC_DEFAULT, .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL, + .private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO, + + .bpffs_socket_fd = bpffs_socket_fd, }; r = setup_namespace(&parameters, reterr_path); @@ -4454,6 +4513,7 @@ static int setup_delegated_namespaces( const ExecCommand *command, bool needs_sandboxing, bool have_cap_sys_admin, + int bpffs_socket_fd, int *reterr_exit_status) { int r; @@ -4574,9 +4634,10 @@ static int setup_delegated_namespaces( runtime, memory_pressure_path, needs_sandboxing, - &error_path, uid, - gid); + gid, + bpffs_socket_fd, + &error_path); if (r < 0) { *reterr_exit_status = EXIT_NAMESPACE; return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m", @@ -4911,7 +4972,9 @@ int exec_invoke( _cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL; int ngids = 0, ngids_after_pam = 0; int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET; + _cleanup_close_ int bpffs_socket_fd = -EBADF, bpffs_errno_pipe = -EBADF; size_t n_storage_fds, n_socket_fds, n_extra_fds; + _cleanup_(pidref_done_sigkill_wait) PidRef bpffs_pidref = PIDREF_NULL; assert(command); assert(context); @@ -5627,6 +5690,26 @@ int exec_invoke( } } + if (context->private_bpf != PRIVATE_BPF_NO) { + /* To create a BPF token, the bpffs has to be mounted with the fsopen()/fsmount() API. + * More specifically, fsopen() must be called within the user namespace, then all the + * fsconfig() calls as privileged user, and finally fsmount() and move_mount() in + * the user namespace. + * To do this, we split the code into the bpffs_prepare() and mount_bpffs() functions: + * the first runs as privileged user, the second as unprivileged one, and they coordinate + * by sending messages and file descriptors via a socket pair. + * The user and mount namespaces need to be unshared in this exact order and before + * the fsopen() call for the fsopen() API to work as unprivileged. 
+ * This is the kernel sample doing this: + * https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/prog_tests/token.c + */ + r = bpffs_prepare(&bpffs_pidref, &bpffs_socket_fd, &bpffs_errno_pipe); + if (r < 0) { + *exit_status = EXIT_BPF; + return log_error_errno(r, "Failed to mount bpffs in bpffs_prepare(): %m"); + } + } + if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) { /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to @@ -5665,6 +5748,7 @@ int exec_invoke( command, needs_sandboxing, have_cap_sys_admin, + bpffs_socket_fd, exit_status); if (r < 0) return r; @@ -5724,10 +5808,30 @@ int exec_invoke( command, needs_sandboxing, have_cap_sys_admin, + bpffs_socket_fd, exit_status); if (r < 0) return r; + if (context->private_bpf != PRIVATE_BPF_NO) { + r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0); + if (r < 0) { + *exit_status = EXIT_BPF; + return r; + } + /* If something strange happened with the child, let's consider this fatal, too */ + if (r != EXIT_SUCCESS) { + *exit_status = EXIT_BPF; + ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r)); + if (ss == sizeof(r)) + return log_debug_errno(r, "bpffs helper exited with error: %m"); + if (ss < 0) + return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m"); + return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe."); + } + pidref_done(&bpffs_pidref); + } + if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) { /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which * ensures the root of the cgroup namespace is the top level service cgroup and not the diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 0a1af05e51d..167e4dfd7fa 100644 
--- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1803,6 +1803,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; + r = serialize_item(f, "exec-context-private-bpf", private_bpf_to_string(c->private_bpf)); + if (r < 0) + return r; + r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode)); if (r < 0) return r; @@ -2741,6 +2745,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { c->proc_subset = proc_subset_from_string(val); if (c->proc_subset < 0) return -EINVAL; + } else if ((val = startswith(l, "exec-context-private-bpf="))) { + c->private_bpf = private_bpf_from_string(val); + if (c->private_bpf < 0) + return -EINVAL; } else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) { c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val); if (c->runtime_directory_preserve_mode < 0) diff --git a/src/core/execute.c b/src/core/execute.c index 5d5cc412076..9fc9e549de3 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -324,6 +324,7 @@ bool exec_needs_mount_namespace( exec_needs_cgroup_mount(context) || context->protect_proc != PROTECT_PROC_DEFAULT || context->proc_subset != PROC_SUBSET_ALL || + context->private_bpf != PRIVATE_BPF_NO || exec_needs_ipc_namespace(context) || exec_needs_pid_namespace(context, params)) return true; @@ -1124,7 +1125,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { "%sKeyringMode: %s\n" "%sProtectHostname: %s%s%s\n" "%sProtectProc: %s\n" - "%sProcSubset: %s\n", + "%sProcSubset: %s\n" + "%sPrivateBPF: %s\n", prefix, c->umask, prefix, empty_to_root(c->working_directory), prefix, empty_to_root(c->root_directory), @@ -1151,7 +1153,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, exec_keyring_mode_to_string(c->keyring_mode), prefix, 
protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname), prefix, protect_proc_to_string(c->protect_proc), - prefix, proc_subset_to_string(c->proc_subset)); + prefix, proc_subset_to_string(c->proc_subset), + prefix, private_bpf_to_string(c->private_bpf)); if (c->set_login_environment >= 0) fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0)); diff --git a/src/core/execute.h b/src/core/execute.h index da1600a0440..6f1df610a8a 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -300,6 +300,8 @@ typedef struct ExecContext { ProtectProc protect_proc; /* hidepid= */ ProcSubset proc_subset; /* subset= */ + PrivateBPF private_bpf; + int private_mounts; int mount_apivfs; int bind_log_sockets; diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 7d4d174d845..edb06395399 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -67,6 +67,7 @@ {{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode) {{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc) {{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset) +{{type}}.PrivateBPF, config_parse_private_bpf, 0, offsetof({{type}}, exec_context.private_bpf) {% if HAVE_SECCOMP %} {{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context) {{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index c1e704b1c6a..9c544a35e05 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGrou DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, 
ExecKeyringMode); DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc); DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset); +DEFINE_CONFIG_PARSE_ENUM(config_parse_private_bpf, private_bpf, PrivateBPF); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs); diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index a31ad750d3d..ba226e2e5c3 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -129,6 +129,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths); CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode); CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc); CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset); +CONFIG_PARSER_PROTOTYPE(config_parse_private_bpf); CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec); CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec); CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields); diff --git a/src/core/namespace.c b/src/core/namespace.c index faa84ced20b..0768eafac22 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -79,6 +79,7 @@ typedef enum MountMode { MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */ MOUNT_MQUEUEFS, MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. 
*/ + MOUNT_BPFFS, /* Special mount for bpffs, which is mounted with fsmount() and move_mount() */ _MOUNT_MODE_MAX, _MOUNT_MODE_INVALID = -EINVAL, } MountMode; @@ -161,13 +162,17 @@ static const MountEntry protect_kernel_tunables_proc_table[] = { static const MountEntry protect_kernel_tunables_sys_table[] = { { "/sys", MOUNT_READ_ONLY, false }, - { "/sys/fs/bpf", MOUNT_READ_ONLY, true }, { "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */ { "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true }, { "/sys/kernel/debug", MOUNT_READ_ONLY, true }, { "/sys/kernel/tracing", MOUNT_READ_ONLY, true }, }; +/* PrivateBPF= option */ +static const MountEntry private_bpf_no_table[] = { + { "/sys/fs/bpf", MOUNT_READ_ONLY, true }, +}; + /* ProtectKernelModules= option */ static const MountEntry protect_kernel_modules_table[] = { { "/usr/lib/modules", MOUNT_INACCESSIBLE, true }, @@ -927,6 +932,36 @@ static int append_protect_system(MountList *ml, ProtectSystem protect_system, bo } } +static int append_private_bpf( + MountList *ml, + PrivateBPF private_bpf, + bool protect_kernel_tunables, + bool ignore_protect, + const NamespaceParameters *p) { + + assert(ml); + + switch (private_bpf) { + case PRIVATE_BPF_NO: + if (protect_kernel_tunables) + return append_static_mounts(ml, private_bpf_no_table, ELEMENTSOF(private_bpf_no_table), ignore_protect); + return 0; + case PRIVATE_BPF_YES: { + MountEntry *me = mount_list_extend(ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/sys/fs/bpf", + .mode = MOUNT_BPFFS, + }; + return 0; + } + default: + assert_not_reached(); + } +} + static int mount_path_compare(const MountEntry *a, const MountEntry *b) { int d; @@ -1697,6 +1732,34 @@ static int mount_overlay(const MountEntry *m) { return 1; } +static int mount_bpffs(const MountEntry *m, int socket_fd) { + int r; + + assert(m); + assert(socket_fd >= 0); + + _cleanup_close_ int fs_fd = fsopen("bpf", 
FSOPEN_CLOEXEC); + if (fs_fd < 0) + return log_debug_errno(errno, "Failed to fsopen: %m"); + + r = send_one_fd(socket_fd, fs_fd, /* flags = */ 0); + if (r < 0) + return log_debug_errno(r, "Failed to send bpffs fd to child: %m"); + + if (read(socket_fd, (uint8_t[1]) {}, 1) < 0) + return log_debug_errno(errno, "Failed to receive data from child: %m"); + + _cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0); + if (mnt_fd < 0) + return log_debug_errno(errno, "Failed to fsmount bpffs: %m"); + + r = move_mount(mnt_fd, "", AT_FDCWD, mount_entry_path(m), MOVE_MOUNT_F_EMPTY_PATH); + if (r < 0) + return log_debug_errno(errno, "Failed to move bpffs mount to %s: %m", mount_entry_path(m)); + + return 1; +} + static int follow_symlink( const char *root_directory, MountEntry *m) { @@ -1953,6 +2016,9 @@ static int apply_one_mount( case MOUNT_OVERLAY: return mount_overlay(m); + case MOUNT_BPFFS: + return mount_bpffs(m, p->bpffs_socket_fd); + default: assert_not_reached(); } @@ -2151,6 +2217,7 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) { p->protect_kernel_tunables || p->protect_proc != PROTECT_PROC_DEFAULT || p->proc_subset != PROC_SUBSET_ALL || + p->private_bpf != PRIVATE_BPF_NO || p->private_pids != PRIVATE_PIDS_NO; } @@ -2653,6 +2720,10 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) { if (r < 0) return r; + r = append_private_bpf(&ml, p->private_bpf, p->protect_kernel_tunables, /* ignore_protect = */ false, p); + if (r < 0) + return r; + if (namespace_parameters_mount_apivfs(p)) { r = append_static_mounts(&ml, apivfs_table, @@ -3888,6 +3959,13 @@ static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset); +static const char* const private_bpf_table[_PRIVATE_BPF_MAX] = { + [PRIVATE_BPF_NO] = "no", + [PRIVATE_BPF_YES] = "yes", +}; + +DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_bpf, PrivateBPF, PRIVATE_BPF_YES); + static const 
char* const private_tmp_table[_PRIVATE_TMP_MAX] = { [PRIVATE_TMP_NO] = "no", [PRIVATE_TMP_CONNECTED] = "connected", diff --git a/src/core/namespace.h b/src/core/namespace.h index eadd991ed2a..178ed1e5480 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -51,6 +51,13 @@ typedef enum ProcSubset { _PROC_SUBSET_INVALID = -EINVAL, } ProcSubset; +typedef enum PrivateBPF { + PRIVATE_BPF_NO, + PRIVATE_BPF_YES, + _PRIVATE_BPF_MAX, + _PRIVATE_BPF_INVALID = -EINVAL, +} PrivateBPF; + typedef enum PrivateTmp { PRIVATE_TMP_NO, PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */ @@ -188,9 +195,12 @@ typedef struct NamespaceParameters { ProtectSystem protect_system; ProtectProc protect_proc; ProcSubset proc_subset; + PrivateBPF private_bpf; PrivateTmp private_tmp; PrivateTmp private_var_tmp; PrivatePIDs private_pids; + + int bpffs_socket_fd; } NamespaceParameters; int setup_namespace(const NamespaceParameters *p, char **reterr_path); @@ -223,6 +233,9 @@ ProtectProc protect_proc_from_string(const char *s) _pure_; const char* proc_subset_to_string(ProcSubset i) _const_; ProcSubset proc_subset_from_string(const char *s) _pure_; +const char* private_bpf_to_string(PrivateBPF i) _const_; +PrivateBPF private_bpf_from_string(const char *s) _pure_; + const char* private_tmp_to_string(PrivateTmp i) _const_; PrivateTmp private_tmp_from_string(const char *s) _pure_; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 0e60cd63d40..8fc97db191b 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -2425,6 +2425,7 @@ static const BusProperty execute_properties[] = { { "MountImagePolicy", bus_append_string }, { "ExtensionImagePolicy", bus_append_string }, { "PrivatePIDs", bus_append_string }, + { "PrivateBPF", bus_append_string }, { "IgnoreSIGPIPE", bus_append_parse_boolean }, { "TTYVHangup", bus_append_parse_boolean }, { "TTYReset", bus_append_parse_boolean }, diff --git a/test/units/TEST-07-PID1.private-bpf.sh 
b/test/units/TEST-07-PID1.private-bpf.sh new file mode 100755 index 00000000000..f0c1dcf73e4 --- /dev/null +++ b/test/units/TEST-07-PID1.private-bpf.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: LGPL-2.1-or-later +set -eux +set -o pipefail + +# Check that with ProtectKernelTunables=yes and PrivateBPF=no, the host bpffs is remounted ro +systemd-run --wait \ + -p PrivateUsers=yes \ + -p PrivateMounts=yes \ + -p DelegateNamespaces=mnt \ + -p ProtectKernelTunables=yes \ + -p PrivateBPF=no \ + grep -q '/sys/fs/bpf .* ro,' /proc/mounts + +# Check that with PrivateBPF=yes, a new bpffs instance is mounted +systemd-run --wait \ + -p PrivateUsers=yes \ + -p PrivateMounts=yes \ + -p DelegateNamespaces=mnt \ + -p PrivateBPF=yes \ + grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts -- 2.47.3