@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property ProcSubset is not documented!-->
+ <!--property PrivateBPF is not documented!-->
+
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property ProcSubset is not documented!-->
+ <!--property PrivateBPF is not documented!-->
+
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property ProcSubset is not documented!-->
+ <!--property PrivateBPF is not documented!-->
+
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+ readonly s PrivateBPF = '...';
+ @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...';
<!--property ProcSubset is not documented!-->
+ <!--property PrivateBPF is not documented!-->
+
<!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!-->
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
+ <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>,
+ <varname>PrivateBPF</varname>,
<function>RemoveSubGroup()</function>,
<varname>StateDirectoryQuota</varname>,
<varname>StateDirectoryQuotaUsage</varname>,
<varname>PassPIDFD</varname>,
<varname>AcceptFileDescriptors</varname>,
<varname>DelegateNamespaces</varname>,
+ <varname>PrivateBPF</varname>,
<function>RemoveSubgroup()</function>,
<varname>DeferTrigger</varname>,
<varname>DeferTriggerMaxUSec</varname>,
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>,
+ <varname>PrivateBPF</varname>,
<function>RemoveSubgroup()</function>,
<varname>ReloadResult</varname>,
<varname>CleanResult</varname>,
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>,
+ <varname>PrivateBPF</varname>,
<function>RemoveSubgroup()</function>,
<varname>StateDirectoryQuota</varname>,
<varname>StateDirectoryQuotaUsage</varname>,
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>PrivateBPF=</varname></term>
+
+ <listitem><para>Takes a boolean argument. If set, mount a private instance of the BPF filesystem
+ on <filename>/sys/fs/bpf/</filename>. Otherwise, if <varname>ProtectKernelTunables=</varname> is set,
+ the instance from the host is inherited but mounted read-only. Defaults to false.</para>
+
+ <xi:include href="version-info.xml" xpointer="v258"/></listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>LockPersonality=</varname></term>
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_private_bpf, private_bpf, PrivateBPF);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostname", "b", property_get_protect_hostname, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostnameEx", "(ss)", property_get_protect_hostname_ex, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+ SD_BUS_PROPERTY("PrivateBPF", "s", property_get_private_bpf, offsetof(ExecContext, private_bpf), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(private_bpf, PrivateBPF, private_bpf_from_string);
BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
if (streq(name, "ProcSubset"))
return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
+ if (streq(name, "PrivateBPF"))
+ return bus_set_transient_private_bpf(u, name, &c->private_bpf, message, flags, error);
+
if (streq(name, "RuntimeDirectoryPreserve"))
return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
return 0;
}
+static int bpffs_prepare(
+ PidRef *ret_pid,
+ int *ret_sock_fd,
+ int *ret_errno_pipe) {
+
+ _cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR;
+ int r;
+
+ assert(ret_sock_fd);
+ assert(ret_pid);
+ assert(ret_errno_pipe);
+
+ r = pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK);
+ if (r < 0)
+ return log_debug_errno(errno, "Failed to create pipe: %m");
+
+ r = socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, socket_fds);
+ if (r < 0)
+ return log_debug_errno(errno, "Failed to create socket pair: %m");
+
+ r = pidref_safe_fork("(sd-bpffs)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, ret_pid);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m");
+ if (r == 0) {
+ _cleanup_close_ int fs_fd = -EBADF;
+
+ bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]);
+ socket_fds[0] = safe_close(socket_fds[0]);
+
+ fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0);
+ if (fs_fd < 0) {
+ log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m");
+ report_errno_and_exit(bpffs_errno_pipe[1], fs_fd);
+ }
+
+ r = fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0);
+ if (r < 0) {
+ log_debug_errno(errno, "Failed to create bpffs superblock: %m");
+ report_errno_and_exit(bpffs_errno_pipe[1], errno);
+ }
+
+ if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) {
+ log_debug_errno(errno, "Failed to send data to child: %m");
+ report_errno_and_exit(bpffs_errno_pipe[1], errno);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ *ret_sock_fd = TAKE_FD(socket_fds[0]);
+ *ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]);
+
+ return 0;
+}
+
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
ExecRuntime *runtime,
const char *memory_pressure_path,
bool needs_sandboxing,
- char **reterr_path,
uid_t exec_directory_uid,
- gid_t exec_directory_gid) {
+ gid_t exec_directory_gid,
+ int bpffs_socket_fd,
+ char **reterr_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
.protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
.protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
.proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
+ .private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
+
+ .bpffs_socket_fd = bpffs_socket_fd,
};
r = setup_namespace(¶meters, reterr_path);
const ExecCommand *command,
bool needs_sandboxing,
bool have_cap_sys_admin,
+ int bpffs_socket_fd,
int *reterr_exit_status) {
int r;
runtime,
memory_pressure_path,
needs_sandboxing,
- &error_path,
uid,
- gid);
+ gid,
+ bpffs_socket_fd,
+ &error_path);
if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
_cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
int ngids = 0, ngids_after_pam = 0;
int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
+ _cleanup_close_ int bpffs_socket_fd = -EBADF, bpffs_errno_pipe = -EBADF;
size_t n_storage_fds, n_socket_fds, n_extra_fds;
+ _cleanup_(pidref_done_sigkill_wait) PidRef bpffs_pidref = PIDREF_NULL;
assert(command);
assert(context);
}
}
+ if (context->private_bpf != PRIVATE_BPF_NO) {
+ /* To create a BPF token, the bpffs has to be mounted with the fsopen()/fsmount() API.
+ * More specifically, fsopen() must be called within the user namespace, then all the
+ * fsconfig() as privileged user, and finally and fsmount() and move_mount() in
+ * the user namespace.
+ * To do this, we split the code into a bpffs_prepare() and mount_bpffs() functions,
+ * the first runs as privileged user the second as unprivileged one, and they coordinate
+ * by sending messages and file descriptors via a socket pair.
+ * The user and mount namespaces need to be unshared in this exact order and before
+ * the fsopen() call for the fsopen() API to work as unprivileged.
+ * This is the kernel sample doing this:
+ * https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/prog_tests/token.c
+ */
+ r = bpffs_prepare(&bpffs_pidref, &bpffs_socket_fd, &bpffs_errno_pipe);
+ if (r < 0) {
+ *exit_status = EXIT_BPF;
+ return log_error_errno(r, "Failed to mount bpffs in bpffs_prepare(): %m");
+ }
+ }
+
if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
command,
needs_sandboxing,
have_cap_sys_admin,
+ bpffs_socket_fd,
exit_status);
if (r < 0)
return r;
command,
needs_sandboxing,
have_cap_sys_admin,
+ bpffs_socket_fd,
exit_status);
if (r < 0)
return r;
+ if (context->private_bpf != PRIVATE_BPF_NO) {
+ r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0);
+ if (r < 0) {
+ *exit_status = EXIT_BPF;
+ return r;
+ }
+ /* If something strange happened with the child, let's consider this fatal, too */
+ if (r != EXIT_SUCCESS) {
+ *exit_status = EXIT_BPF;
+ ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r));
+ if (ss == sizeof(r))
+ return log_debug_errno(r, "bpffs helper exited with error: %m");
+ if (ss < 0)
+ return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m");
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe.");
+ }
+ pidref_done(&bpffs_pidref);
+ }
+
if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
* ensures the root of the cgroup namespace is the top level service cgroup and not the
if (r < 0)
return r;
+ r = serialize_item(f, "exec-context-private-bpf", private_bpf_to_string(c->private_bpf));
+ if (r < 0)
+ return r;
+
r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
if (r < 0)
return r;
c->proc_subset = proc_subset_from_string(val);
if (c->proc_subset < 0)
return -EINVAL;
+ } else if ((val = startswith(l, "exec-context-private-bpf="))) {
+ c->private_bpf = private_bpf_from_string(val);
+ if (c->private_bpf < 0)
+ return -EINVAL;
} else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) {
c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val);
if (c->runtime_directory_preserve_mode < 0)
exec_needs_cgroup_mount(context) ||
context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL ||
+ context->private_bpf != PRIVATE_BPF_NO ||
exec_needs_ipc_namespace(context) ||
exec_needs_pid_namespace(context, params))
return true;
"%sKeyringMode: %s\n"
"%sProtectHostname: %s%s%s\n"
"%sProtectProc: %s\n"
- "%sProcSubset: %s\n",
+ "%sProcSubset: %s\n"
+ "%sPrivateBPF: %s\n",
prefix, c->umask,
prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory),
prefix, exec_keyring_mode_to_string(c->keyring_mode),
prefix, protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname),
prefix, protect_proc_to_string(c->protect_proc),
- prefix, proc_subset_to_string(c->proc_subset));
+ prefix, proc_subset_to_string(c->proc_subset),
+ prefix, private_bpf_to_string(c->private_bpf));
if (c->set_login_environment >= 0)
fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0));
ProtectProc protect_proc; /* hidepid= */
ProcSubset proc_subset; /* subset= */
+ PrivateBPF private_bpf;
+
int private_mounts;
int mount_apivfs;
int bind_log_sockets;
{{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode)
{{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc)
{{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset)
+{{type}}.PrivateBPF, config_parse_private_bpf, 0, offsetof({{type}}, exec_context.private_bpf)
{% if HAVE_SECCOMP %}
{{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context)
{{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs)
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_private_bpf, private_bpf, PrivateBPF);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
+CONFIG_PARSER_PROTOTYPE(config_parse_private_bpf);
CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */
MOUNT_MQUEUEFS,
MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
+ MOUNT_BPFFS, /* Special mount for bpffs, which is mounted with fsmount() and move_mount() */
_MOUNT_MODE_MAX,
_MOUNT_MODE_INVALID = -EINVAL,
} MountMode;
static const MountEntry protect_kernel_tunables_sys_table[] = {
{ "/sys", MOUNT_READ_ONLY, false },
- { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
{ "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
{ "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
{ "/sys/kernel/debug", MOUNT_READ_ONLY, true },
{ "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
};
+/* PrivateBPF= option */
+static const MountEntry private_bpf_no_table[] = {
+ { "/sys/fs/bpf", MOUNT_READ_ONLY, true },
+};
+
/* ProtectKernelModules= option */
static const MountEntry protect_kernel_modules_table[] = {
{ "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
}
}
+static int append_private_bpf(
+ MountList *ml,
+ PrivateBPF private_bpf,
+ bool protect_kernel_tunables,
+ bool ignore_protect,
+ const NamespaceParameters *p) {
+
+ assert(ml);
+
+ switch (private_bpf) {
+ case PRIVATE_BPF_NO:
+ if (protect_kernel_tunables)
+ return append_static_mounts(ml, private_bpf_no_table, ELEMENTSOF(private_bpf_no_table), ignore_protect);
+ return 0;
+ case PRIVATE_BPF_YES: {
+ MountEntry *me = mount_list_extend(ml);
+ if (!me)
+ return log_oom_debug();
+
+ *me = (MountEntry) {
+ .path_const = "/sys/fs/bpf",
+ .mode = MOUNT_BPFFS,
+ };
+ return 0;
+ }
+ default:
+ assert_not_reached();
+ }
+}
+
static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
int d;
return 1;
}
+static int mount_bpffs(const MountEntry *m, int socket_fd) {
+ int r;
+
+ assert(m);
+ assert(socket_fd >= 0);
+
+ _cleanup_close_ int fs_fd = fsopen("bpf", FSOPEN_CLOEXEC);
+ if (fs_fd < 0)
+ return log_debug_errno(errno, "Failed to fsopen: %m");
+
+ r = send_one_fd(socket_fd, fs_fd, /* flags = */ 0);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to send bpffs fd to child: %m");
+
+ if (read(socket_fd, (uint8_t[1]) {}, 1) < 0)
+ return log_debug_errno(errno, "Failed to receive data from child: %m");
+
+ _cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0);
+ if (mnt_fd < 0)
+ return log_debug_errno(errno, "Failed to fsmount bpffs: %m");
+
+ r = move_mount(mnt_fd, "", AT_FDCWD, mount_entry_path(m), MOVE_MOUNT_F_EMPTY_PATH);
+ if (r < 0)
+ return log_debug_errno(errno, "Failed to move bpffs mount to %s: %m", mount_entry_path(m));
+
+ return 1;
+}
+
static int follow_symlink(
const char *root_directory,
MountEntry *m) {
case MOUNT_OVERLAY:
return mount_overlay(m);
+ case MOUNT_BPFFS:
+ return mount_bpffs(m, p->bpffs_socket_fd);
+
default:
assert_not_reached();
}
p->protect_kernel_tunables ||
p->protect_proc != PROTECT_PROC_DEFAULT ||
p->proc_subset != PROC_SUBSET_ALL ||
+ p->private_bpf != PRIVATE_BPF_NO ||
p->private_pids != PRIVATE_PIDS_NO;
}
if (r < 0)
return r;
+ r = append_private_bpf(&ml, p->private_bpf, p->protect_kernel_tunables, /* ignore_protect = */ false, p);
+ if (r < 0)
+ return r;
+
if (namespace_parameters_mount_apivfs(p)) {
r = append_static_mounts(&ml,
apivfs_table,
DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
+static const char* const private_bpf_table[_PRIVATE_BPF_MAX] = {
+ [PRIVATE_BPF_NO] = "no",
+ [PRIVATE_BPF_YES] = "yes",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_bpf, PrivateBPF, PRIVATE_BPF_YES);
+
static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
[PRIVATE_TMP_NO] = "no",
[PRIVATE_TMP_CONNECTED] = "connected",
_PROC_SUBSET_INVALID = -EINVAL,
} ProcSubset;
+typedef enum PrivateBPF {
+ PRIVATE_BPF_NO,
+ PRIVATE_BPF_YES,
+ _PRIVATE_BPF_MAX,
+ _PRIVATE_BPF_INVALID = -EINVAL,
+} PrivateBPF;
+
typedef enum PrivateTmp {
PRIVATE_TMP_NO,
PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */
ProtectSystem protect_system;
ProtectProc protect_proc;
ProcSubset proc_subset;
+ PrivateBPF private_bpf;
PrivateTmp private_tmp;
PrivateTmp private_var_tmp;
PrivatePIDs private_pids;
+
+ int bpffs_socket_fd;
} NamespaceParameters;
int setup_namespace(const NamespaceParameters *p, char **reterr_path);
const char* proc_subset_to_string(ProcSubset i) _const_;
ProcSubset proc_subset_from_string(const char *s) _pure_;
+const char* private_bpf_to_string(PrivateBPF i) _const_;
+PrivateBPF private_bpf_from_string(const char *s) _pure_;
+
const char* private_tmp_to_string(PrivateTmp i) _const_;
PrivateTmp private_tmp_from_string(const char *s) _pure_;
{ "MountImagePolicy", bus_append_string },
{ "ExtensionImagePolicy", bus_append_string },
{ "PrivatePIDs", bus_append_string },
+ { "PrivateBPF", bus_append_string },
{ "IgnoreSIGPIPE", bus_append_parse_boolean },
{ "TTYVHangup", bus_append_parse_boolean },
{ "TTYReset", bus_append_parse_boolean },
--- /dev/null
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+set -eux
+set -o pipefail
+
+# Check that with ProtectKernelTunables=yes and PrivateBPF=no, the host bpffs is remounted ro
+systemd-run --wait \
+ -p PrivateUsers=yes \
+ -p PrivateMounts=yes \
+ -p DelegateNamespaces=mnt \
+ -p ProtectKernelTunables=yes \
+ -p PrivateBPF=no \
+ grep -q '/sys/fs/bpf .* ro,' /proc/mounts
+
+# Check that with PrivateBPF=yes, a new bpffs instance is mounted
+systemd-run --wait \
+ -p PrivateUsers=yes \
+ -p PrivateMounts=yes \
+ -p DelegateNamespaces=mnt \
+ -p PrivateBPF=yes \
+ grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts