]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: Introduce PrivateBPF= to mount a private BPFFS
authorMatteo Croce <teknoraver@meta.com>
Fri, 27 Jun 2025 12:17:00 +0000 (14:17 +0200)
committerMatteo Croce <teknoraver@meta.com>
Tue, 8 Jul 2025 20:33:28 +0000 (22:33 +0200)
Add a new option, PrivateBPF=, to mount a new instance of bpffs within a
namespace.
PrivateBPF= can be set to "no" to keep using the host bpffs (mounted read-only
when ProtectKernelTunables= is set) or to "yes" to mount a new instance.
The mount is done with the new fsopen()/fsmount() API because in the future
we'll hook some commands between the two calls.

14 files changed:
man/org.freedesktop.systemd1.xml
man/systemd.exec.xml
src/core/dbus-execute.c
src/core/exec-invoke.c
src/core/execute-serialize.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.in
src/core/load-fragment.c
src/core/load-fragment.h
src/core/namespace.c
src/core/namespace.h
src/shared/bus-unit-util.c
test/units/TEST-07-PID1.private-bpf.sh [new file with mode: 0755]

index d5f270c68179fc504f45f671d227c0b3860b36cd..95fb54d2f29a4e6e199a10dd7f11d07972b52062 100644 (file)
@@ -3374,6 +3374,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly (ss) ProtectHostnameEx = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s PrivateBPF = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b MemoryKSM = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s NetworkNamespacePath = '...';
@@ -3975,6 +3977,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <!--property ProcSubset is not documented!-->
 
+    <!--property PrivateBPF is not documented!-->
+
     <!--property MemoryKSM is not documented!-->
 
     <!--property NetworkNamespacePath is not documented!-->
@@ -4701,6 +4705,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -5583,6 +5589,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly (ss) ProtectHostnameEx = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s PrivateBPF = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b MemoryKSM = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s NetworkNamespacePath = '...';
@@ -6204,6 +6212,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <!--property ProcSubset is not documented!-->
 
+    <!--property PrivateBPF is not documented!-->
+
     <!--property MemoryKSM is not documented!-->
 
     <!--property NetworkNamespacePath is not documented!-->
@@ -6910,6 +6920,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -7616,6 +7628,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly (ss) ProtectHostnameEx = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s PrivateBPF = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b MemoryKSM = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s NetworkNamespacePath = '...';
@@ -8159,6 +8173,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <!--property ProcSubset is not documented!-->
 
+    <!--property PrivateBPF is not documented!-->
+
     <!--property MemoryKSM is not documented!-->
 
     <!--property NetworkNamespacePath is not documented!-->
@@ -8773,6 +8789,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -9612,6 +9630,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly (ss) ProtectHostnameEx = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
+      readonly s PrivateBPF = '...';
+      @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly b MemoryKSM = ...;
       @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
       readonly s NetworkNamespacePath = '...';
@@ -10137,6 +10157,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <!--property ProcSubset is not documented!-->
 
+    <!--property PrivateBPF is not documented!-->
+
     <!--property MemoryKSM is not documented!-->
 
     <!--property NetworkNamespacePath is not documented!-->
@@ -10733,6 +10755,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
 
     <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
 
+    <variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
+
     <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
 
     <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@@ -12316,6 +12340,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>PrivatePIDs</varname> were added in version 257.</para>
       <para><varname>ProtectHostnameEx</varname>,
       <varname>DelegateNamespaces</varname>,
+      <varname>PrivateBPF</varname>,
       <function>RemoveSubGroup()</function>,
       <varname>StateDirectoryQuota</varname>,
       <varname>StateDirectoryQuotaUsage</varname>,
@@ -12374,6 +12399,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>PassPIDFD</varname>,
       <varname>AcceptFileDescriptors</varname>,
       <varname>DelegateNamespaces</varname>,
+      <varname>PrivateBPF</varname>,
       <function>RemoveSubgroup()</function>,
       <varname>DeferTrigger</varname>,
       <varname>DeferTriggerMaxUSec</varname>,
@@ -12429,6 +12455,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>PrivatePIDs</varname> were added in version 257.</para>
       <para><varname>ProtectHostnameEx</varname>,
       <varname>DelegateNamespaces</varname>,
+      <varname>PrivateBPF</varname>,
       <function>RemoveSubgroup()</function>,
       <varname>ReloadResult</varname>,
       <varname>CleanResult</varname>,
@@ -12484,6 +12511,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
       <varname>PrivatePIDs</varname> were added in version 257.</para>
       <para><varname>ProtectHostnameEx</varname>,
       <varname>DelegateNamespaces</varname>,
+      <varname>PrivateBPF</varname>,
       <function>RemoveSubgroup()</function>,
       <varname>StateDirectoryQuota</varname>,
       <varname>StateDirectoryQuotaUsage</varname>,
index 813ea0231384d2b182db12095f5287f38d65a4df..85db1de264e26f1e7bfc1fe43590c3420ef0d477 100644 (file)
@@ -2555,6 +2555,16 @@ RestrictNamespaces=~cgroup net</programlisting>
         <xi:include href="version-info.xml" xpointer="v258"/></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>PrivateBPF=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, a private instance of the BPF file system is
+        mounted on <filename>/sys/fs/bpf/</filename>. If false, the instance from the host is inherited, and
+        it is mounted read-only if <varname>ProtectKernelTunables=</varname> is set. Defaults to false.</para>
+
+        <xi:include href="version-info.xml" xpointer="v258"/></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>LockPersonality=</varname></term>
 
index b1e3df1688a4232fea49ea4b3fa0e4238bf5d41e..7e4d6fa6dbf425edb534b5c58676a8dcb864c237 100644 (file)
@@ -54,6 +54,7 @@ BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
+static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_private_bpf, private_bpf, PrivateBPF);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
@@ -1316,6 +1317,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectHostname", "b", property_get_protect_hostname, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectHostnameEx", "(ss)", property_get_protect_hostname_ex, 0, SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("PrivateBPF", "s", property_get_private_bpf, offsetof(ExecContext, private_bpf), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1753,6 +1755,7 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_fr
 static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
+static BUS_DEFINE_SET_TRANSIENT_PARSE(private_bpf, PrivateBPF, private_bpf_from_string);
 BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
 static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
 static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
@@ -2279,6 +2282,9 @@ int bus_exec_context_set_transient_property(
         if (streq(name, "ProcSubset"))
                 return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
 
+        if (streq(name, "PrivateBPF"))
+                return bus_set_transient_private_bpf(u, name, &c->private_bpf, message, flags, error);
+
         if (streq(name, "RuntimeDirectoryPreserve"))
                 return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);
 
index 09deb0f5c11651b8351085f4752be6a98f742617..e6fce99340b040d018e7dbf63eeac9872f8cde3f 100644 (file)
@@ -2270,6 +2270,61 @@ static int setup_private_users_child(int unshare_ready_fd, const char *uid_map,
         return 0;
 }
 
+static int bpffs_prepare(
+                PidRef *ret_pid,
+                int *ret_sock_fd,
+                int *ret_errno_pipe) {
+
+        _cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR;
+        int r;
+
+        assert(ret_sock_fd);
+        assert(ret_pid);
+        assert(ret_errno_pipe);
+
+        /* Fork the "(sd-bpffs)" helper: it receives an fsopen() fd over the socket pair, issues
+         * FSCONFIG_CMD_CREATE on it and acknowledges with one byte. On success returns 0 and hands out
+         * the helper's pidref, our end of the socket pair and the read end of the helper's error pipe. */
+        if (pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK) < 0)
+                return log_debug_errno(errno, "Failed to create pipe: %m");
+
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, socket_fds) < 0)
+                return log_debug_errno(errno, "Failed to create socket pair: %m");
+
+        r = pidref_safe_fork("(sd-bpffs)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, ret_pid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m");
+        if (r == 0) {
+                bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]);
+                socket_fds[0] = safe_close(socket_fds[0]);
+
+                _cleanup_close_ int fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0);
+                if (fs_fd < 0) {
+                        log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m");
+                        report_errno_and_exit(bpffs_errno_pipe[1], fs_fd);
+                }
+
+                /* Report failures as negative errno, same sign as the receive_one_fd() failure above. */
+                r = fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0);
+                if (r < 0) {
+                        r = log_debug_errno(errno, "Failed to create bpffs superblock: %m");
+                        report_errno_and_exit(bpffs_errno_pipe[1], r);
+                }
+
+                if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) {
+                        r = log_debug_errno(errno, "Failed to send data to parent: %m");
+                        report_errno_and_exit(bpffs_errno_pipe[1], r);
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+
+        *ret_sock_fd = TAKE_FD(socket_fds[0]);
+        *ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]);
+
+        return 0;
+}
+
 static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
         _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
@@ -3600,9 +3655,10 @@ static int apply_mount_namespace(
                 ExecRuntime *runtime,
                 const char *memory_pressure_path,
                 bool needs_sandboxing,
-                char **reterr_path,
                 uid_t exec_directory_uid,
-                gid_t exec_directory_gid) {
+                gid_t exec_directory_gid,
+                int bpffs_socket_fd,
+                char **reterr_path) {
 
         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
@@ -3814,6 +3870,9 @@ static int apply_mount_namespace(
                 .protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
                 .protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
                 .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
+                .private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
+
+                .bpffs_socket_fd = bpffs_socket_fd,
         };
 
         r = setup_namespace(&parameters, reterr_path);
@@ -4454,6 +4513,7 @@ static int setup_delegated_namespaces(
                 const ExecCommand *command,
                 bool needs_sandboxing,
                 bool have_cap_sys_admin,
+                int bpffs_socket_fd,
                 int *reterr_exit_status) {
 
         int r;
@@ -4574,9 +4634,10 @@ static int setup_delegated_namespaces(
                                           runtime,
                                           memory_pressure_path,
                                           needs_sandboxing,
-                                          &error_path,
                                           uid,
-                                          gid);
+                                          gid,
+                                          bpffs_socket_fd,
+                                          &error_path);
                 if (r < 0) {
                         *reterr_exit_status = EXIT_NAMESPACE;
                         return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
@@ -4911,7 +4972,9 @@ int exec_invoke(
         _cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
         int ngids = 0, ngids_after_pam = 0;
         int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
+        _cleanup_close_ int bpffs_socket_fd = -EBADF, bpffs_errno_pipe = -EBADF;
         size_t n_storage_fds, n_socket_fds, n_extra_fds;
+        _cleanup_(pidref_done_sigkill_wait) PidRef bpffs_pidref = PIDREF_NULL;
 
         assert(command);
         assert(context);
@@ -5627,6 +5690,26 @@ int exec_invoke(
                 }
         }
 
+        if (context->private_bpf != PRIVATE_BPF_NO) {
+                /* To create a BPF token, the bpffs has to be mounted with the fsopen()/fsmount() API.
+                 * More specifically, fsopen() must be called within the user namespace, then all the
+                 * fsconfig() calls must be done as privileged user, and finally fsmount() and move_mount()
+                 * must be called in the user namespace.
+                 * To do this, we split the code into the bpffs_prepare() and mount_bpffs() functions:
+                 * the first runs as privileged user, the second as unprivileged one, and they coordinate
+                 * by sending messages and file descriptors via a socket pair.
+                 * The user and mount namespaces need to be unshared in this exact order and before
+                 * the fsopen() call for the fsopen() API to work as unprivileged.
+                 * This is the kernel sample doing this:
+                 * https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/prog_tests/token.c
+                 */
+                r = bpffs_prepare(&bpffs_pidref, &bpffs_socket_fd, &bpffs_errno_pipe);
+                if (r < 0) {
+                        *exit_status = EXIT_BPF;
+                        return log_error_errno(r, "Failed to mount bpffs in bpffs_prepare(): %m");
+                }
+        }
+
         if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
@@ -5665,6 +5748,7 @@ int exec_invoke(
                         command,
                         needs_sandboxing,
                         have_cap_sys_admin,
+                        bpffs_socket_fd,
                         exit_status);
         if (r < 0)
                 return r;
@@ -5724,10 +5808,30 @@ int exec_invoke(
                         command,
                         needs_sandboxing,
                         have_cap_sys_admin,
+                        bpffs_socket_fd,
                         exit_status);
         if (r < 0)
                 return r;
 
+        if (context->private_bpf != PRIVATE_BPF_NO) {
+                r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0);
+                if (r < 0) {
+                        *exit_status = EXIT_BPF;
+                        return r;
+                }
+                /* If something strange happened with the child, let's consider this fatal, too */
+                if (r != EXIT_SUCCESS) {
+                        *exit_status = EXIT_BPF;
+                        ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r));
+                        if (ss == sizeof(r))
+                                return log_debug_errno(r, "bpffs helper exited with error: %m");
+                        if (ss < 0)
+                                return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m");
+                        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe.");
+                }
+                pidref_done(&bpffs_pidref);
+        }
+
         if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
                 /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
                  * ensures the root of the cgroup namespace is the top level service cgroup and not the
index 0a1af05e51dcf6884d0152b13be72ca8268a1bd8..167e4dfd7fa9cbfcfe84fb4410ac8ef5f7f29349 100644 (file)
@@ -1803,6 +1803,10 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
         if (r < 0)
                 return r;
 
+        r = serialize_item(f, "exec-context-private-bpf", private_bpf_to_string(c->private_bpf));
+        if (r < 0)
+                return r;
+
         r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
         if (r < 0)
                 return r;
@@ -2741,6 +2745,10 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
                         c->proc_subset = proc_subset_from_string(val);
                         if (c->proc_subset < 0)
                                 return -EINVAL;
+                } else if ((val = startswith(l, "exec-context-private-bpf="))) {
+                        c->private_bpf = private_bpf_from_string(val);
+                        if (c->private_bpf < 0)
+                                return -EINVAL;
                 } else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) {
                         c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val);
                         if (c->runtime_directory_preserve_mode < 0)
index 5d5cc4120765f71c44f0999be52ff428dd73cc1b..9fc9e549de38beefc5e2c23a72c59a2567ed8f8e 100644 (file)
@@ -324,6 +324,7 @@ bool exec_needs_mount_namespace(
             exec_needs_cgroup_mount(context) ||
             context->protect_proc != PROTECT_PROC_DEFAULT ||
             context->proc_subset != PROC_SUBSET_ALL ||
+            context->private_bpf != PRIVATE_BPF_NO ||
             exec_needs_ipc_namespace(context) ||
             exec_needs_pid_namespace(context, params))
                 return true;
@@ -1124,7 +1125,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                 "%sKeyringMode: %s\n"
                 "%sProtectHostname: %s%s%s\n"
                 "%sProtectProc: %s\n"
-                "%sProcSubset: %s\n",
+                "%sProcSubset: %s\n"
+                "%sPrivateBPF: %s\n",
                 prefix, c->umask,
                 prefix, empty_to_root(c->working_directory),
                 prefix, empty_to_root(c->root_directory),
@@ -1151,7 +1153,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
                 prefix, protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname),
                 prefix, protect_proc_to_string(c->protect_proc),
-                prefix, proc_subset_to_string(c->proc_subset));
+                prefix, proc_subset_to_string(c->proc_subset),
+                prefix, private_bpf_to_string(c->private_bpf));
 
         if (c->set_login_environment >= 0)
                 fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0));
index da1600a0440257f5cb231c6798222b9c555ccf7a..6f1df610a8a5e6dae4fec98858a7ed16699ba860 100644 (file)
@@ -300,6 +300,8 @@ typedef struct ExecContext {
         ProtectProc protect_proc;  /* hidepid= */
         ProcSubset proc_subset;    /* subset= */
 
+        PrivateBPF private_bpf;
+
         int private_mounts;
         int mount_apivfs;
         int bind_log_sockets;
index 7d4d174d845e5b66744f8fa14a466d865affcf86..edb063953997b582d8b82e8219ca0221b7d8c142 100644 (file)
@@ -67,6 +67,7 @@
 {{type}}.KeyringMode,                         config_parse_exec_keyring_mode,                     0,                                  offsetof({{type}}, exec_context.keyring_mode)
 {{type}}.ProtectProc,                         config_parse_protect_proc,                          0,                                  offsetof({{type}}, exec_context.protect_proc)
 {{type}}.ProcSubset,                          config_parse_proc_subset,                           0,                                  offsetof({{type}}, exec_context.proc_subset)
+{{type}}.PrivateBPF,                          config_parse_private_bpf,                           0,                                  offsetof({{type}}, exec_context.private_bpf)
 {% if HAVE_SECCOMP %}
 {{type}}.SystemCallFilter,                    config_parse_syscall_filter,                        0,                                  offsetof({{type}}, exec_context)
 {{type}}.SystemCallArchitectures,             config_parse_syscall_archs,                         0,                                  offsetof({{type}}, exec_context.syscall_archs)
index c1e704b1c6ab597dd6e1121c1fbc0d5b8507e762..9c544a35e052a703127f7e52eb6f381a0e2ef594 100644 (file)
@@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGrou
 DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
 DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
 DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_private_bpf, private_bpf, PrivateBPF);
 DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
 DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
 DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);
index a31ad750d3d044d801fed35f68741833ae7f70df..ba226e2e5c39ae725fc6291ad329f35b86493e01 100644 (file)
@@ -129,6 +129,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
 CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
 CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
 CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
+CONFIG_PARSER_PROTOTYPE(config_parse_private_bpf);
 CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
 CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
 CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
index faa84ced20bbba4e1068ee5008ae40493f20ba99..0768eafac22ec159b500a96f8e84c438b39baaad 100644 (file)
@@ -79,6 +79,7 @@ typedef enum MountMode {
         MOUNT_EXTENSION_IMAGE,     /* Mounted outside the root directory, and used by subsequent mounts */
         MOUNT_MQUEUEFS,
         MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
+        MOUNT_BPFFS,               /* Special mount for bpffs, which is mounted with fsmount() and move_mount() */
         _MOUNT_MODE_MAX,
         _MOUNT_MODE_INVALID = -EINVAL,
 } MountMode;
@@ -161,13 +162,17 @@ static const MountEntry protect_kernel_tunables_proc_table[] = {
 
 static const MountEntry protect_kernel_tunables_sys_table[] = {
         { "/sys",                MOUNT_READ_ONLY,           false },
-        { "/sys/fs/bpf",         MOUNT_READ_ONLY,           true  },
         { "/sys/fs/cgroup",      MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
         { "/sys/fs/selinux",     MOUNT_READ_WRITE_IMPLICIT, true  },
         { "/sys/kernel/debug",   MOUNT_READ_ONLY,           true  },
         { "/sys/kernel/tracing", MOUNT_READ_ONLY,           true  },
 };
 
+/* PrivateBPF=no: entry appended by append_private_bpf() only when ProtectKernelTunables= is enabled */
+static const MountEntry private_bpf_no_table[] = {
+        { "/sys/fs/bpf",         MOUNT_READ_ONLY,    true  },
+};
+
 /* ProtectKernelModules= option */
 static const MountEntry protect_kernel_modules_table[] = {
         { "/usr/lib/modules",    MOUNT_INACCESSIBLE, true  },
@@ -927,6 +932,36 @@ static int append_protect_system(MountList *ml, ProtectSystem protect_system, bo
         }
 }
 
+static int append_private_bpf(
+                MountList *ml,
+                PrivateBPF private_bpf,
+                bool protect_kernel_tunables,
+                bool ignore_protect,
+                const NamespaceParameters *p) {
+
+        /* Queues the /sys/fs/bpf mount entry matching PrivateBPF=. Note that 'p' is not referenced here. */
+
+        assert(ml);
+
+        if (private_bpf == PRIVATE_BPF_NO) {
+                /* No private instance: the read-only entry is only added with ProtectKernelTunables=. */
+                if (!protect_kernel_tunables)
+                        return 0;
+
+                return append_static_mounts(ml, private_bpf_no_table, ELEMENTSOF(private_bpf_no_table), ignore_protect);
+        }
+
+        if (private_bpf != PRIVATE_BPF_YES)
+                assert_not_reached();
+
+        MountEntry *entry = mount_list_extend(ml);
+        if (!entry)
+                return log_oom_debug();
+
+        *entry = (MountEntry) { .path_const = "/sys/fs/bpf", .mode = MOUNT_BPFFS };
+        return 0;
+}
+
 static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
         int d;
 
@@ -1697,6 +1732,34 @@ static int mount_overlay(const MountEntry *m) {
         return 1;
 }
 
+static int mount_bpffs(const MountEntry *m, int socket_fd) {
+        /* fsopen() a bpffs, have the helper on socket_fd create it, then fsmount()/move_mount() it. */
+        assert(m);
+        assert(socket_fd >= 0);
+
+        _cleanup_close_ int fs_fd = fsopen("bpf", FSOPEN_CLOEXEC);
+        if (fs_fd < 0)
+                return log_debug_errno(errno, "Failed to fsopen: %m");
+
+        int r = send_one_fd(socket_fd, fs_fd, /* flags = */ 0);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to send bpffs fd to child: %m");
+
+        ssize_t n = read(socket_fd, (uint8_t[1]) {}, 1);
+        if (n < 0)
+                return log_debug_errno(errno, "Failed to receive data from child: %m");
+        if (n == 0) /* EOF: the helper went away without sending its acknowledgement byte */
+                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "bpffs helper closed the socket before acknowledging.");
+
+        _cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0);
+        if (mnt_fd < 0)
+                return log_debug_errno(errno, "Failed to fsmount bpffs: %m");
+
+        if (move_mount(mnt_fd, "", AT_FDCWD, mount_entry_path(m), MOVE_MOUNT_F_EMPTY_PATH) < 0)
+                return log_debug_errno(errno, "Failed to move bpffs mount to %s: %m", mount_entry_path(m));
+        return 1;
+}
+
 static int follow_symlink(
                 const char *root_directory,
                 MountEntry *m) {
@@ -1953,6 +2016,9 @@ static int apply_one_mount(
         case MOUNT_OVERLAY:
                 return mount_overlay(m);
 
+        case MOUNT_BPFFS:
+                return mount_bpffs(m, p->bpffs_socket_fd);
+
         default:
                 assert_not_reached();
         }
@@ -2151,6 +2217,7 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
                 p->protect_kernel_tunables ||
                 p->protect_proc != PROTECT_PROC_DEFAULT ||
                 p->proc_subset != PROC_SUBSET_ALL ||
+                p->private_bpf != PRIVATE_BPF_NO ||
                 p->private_pids != PRIVATE_PIDS_NO;
 }
 
@@ -2653,6 +2720,10 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
         if (r < 0)
                 return r;
 
+        r = append_private_bpf(&ml, p->private_bpf, p->protect_kernel_tunables, /* ignore_protect = */ false, p);
+        if (r < 0)
+                return r;
+
         if (namespace_parameters_mount_apivfs(p)) {
                 r = append_static_mounts(&ml,
                                          apivfs_table,
@@ -3888,6 +3959,13 @@ static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
 
 DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
 
+static const char* const private_bpf_table[_PRIVATE_BPF_MAX] = {
+        [PRIVATE_BPF_NO]  = "no",
+        [PRIVATE_BPF_YES] = "yes",
+};
+/* NOTE(review): the _WITH_BOOLEAN variant presumably maps boolean "true" spellings to PRIVATE_BPF_YES — confirm. */
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_bpf, PrivateBPF, PRIVATE_BPF_YES);
+
 static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
         [PRIVATE_TMP_NO]           = "no",
         [PRIVATE_TMP_CONNECTED]    = "connected",
index eadd991ed2a135c6f1af9e70f1dda672a9f04b64..178ed1e54801a77a2103a0550413839f26fae2e1 100644 (file)
@@ -51,6 +51,13 @@ typedef enum ProcSubset {
         _PROC_SUBSET_INVALID = -EINVAL,
 } ProcSubset;
 
+typedef enum PrivateBPF {
+        PRIVATE_BPF_NO,         /* keep the host's bpffs (read-only entry added if ProtectKernelTunables= is set) */
+        PRIVATE_BPF_YES,        /* mount a new bpffs instance on /sys/fs/bpf */
+        _PRIVATE_BPF_MAX,       /* number of valid values; sizes private_bpf_table[] */
+        _PRIVATE_BPF_INVALID = -EINVAL,
+} PrivateBPF;
+
 typedef enum PrivateTmp {
         PRIVATE_TMP_NO,
         PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */
@@ -188,9 +195,12 @@ typedef struct NamespaceParameters {
         ProtectSystem protect_system;
         ProtectProc protect_proc;
         ProcSubset proc_subset;
+        PrivateBPF private_bpf;
         PrivateTmp private_tmp;
         PrivateTmp private_var_tmp;
         PrivatePIDs private_pids;
+
+        int bpffs_socket_fd;
 } NamespaceParameters;
 
 int setup_namespace(const NamespaceParameters *p, char **reterr_path);
@@ -223,6 +233,9 @@ ProtectProc protect_proc_from_string(const char *s) _pure_;
 const char* proc_subset_to_string(ProcSubset i) _const_;
 ProcSubset proc_subset_from_string(const char *s) _pure_;
 
+const char* private_bpf_to_string(PrivateBPF i) _const_;
+PrivateBPF private_bpf_from_string(const char *s) _pure_;
+
 const char* private_tmp_to_string(PrivateTmp i) _const_;
 PrivateTmp private_tmp_from_string(const char *s) _pure_;
 
index 0e60cd63d40e042cdc87b4e6972a7b89619b0301..8fc97db191badb9a9dbd89de68781456639b8a9b 100644 (file)
@@ -2425,6 +2425,7 @@ static const BusProperty execute_properties[] = {
         { "MountImagePolicy",                      bus_append_string                             },
         { "ExtensionImagePolicy",                  bus_append_string                             },
         { "PrivatePIDs",                           bus_append_string                             },
+        { "PrivateBPF",                            bus_append_string                             },
         { "IgnoreSIGPIPE",                         bus_append_parse_boolean                      },
         { "TTYVHangup",                            bus_append_parse_boolean                      },
         { "TTYReset",                              bus_append_parse_boolean                      },
diff --git a/test/units/TEST-07-PID1.private-bpf.sh b/test/units/TEST-07-PID1.private-bpf.sh
new file mode 100755 (executable)
index 0000000..f0c1dcf
--- /dev/null
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+set -eux
+set -o pipefail
+
+# With ProtectKernelTunables=yes and PrivateBPF=no, /proc/mounts must list /sys/fs/bpf with the "ro," option
+systemd-run --wait \
+        -p PrivateUsers=yes \
+        -p PrivateMounts=yes \
+        -p DelegateNamespaces=mnt \
+        -p ProtectKernelTunables=yes \
+        -p PrivateBPF=no \
+        grep -q '/sys/fs/bpf .* ro,' /proc/mounts
+
+# With PrivateBPF=yes, /proc/mounts must list a read-write bpf mount with source "none" on /sys/fs/bpf
+systemd-run --wait \
+        -p PrivateUsers=yes \
+        -p PrivateMounts=yes \
+        -p DelegateNamespaces=mnt \
+        -p PrivateBPF=yes \
+        grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts