From: Christian Brauner <brauner@kernel.org>
Date: Fri, 8 May 2026 08:49:10 +0000 (+0200)
Subject: core: add self-protection guard for RestrictFileSystemAccess= BPF LSM
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=51e88de7f8be8434f474bc595d909ba8826cc01f;p=thirdparty%2Fsystemd.git

core: add self-protection guard for RestrictFileSystemAccess= BPF LSM

Add self-protection guard programs to the RestrictFileSystemAccess= skeleton that
prevent non-PID1 processes from obtaining FDs to our maps, programs, or
links via the bpf() syscall.

This blocks the primary attack vector against the RestrictFileSystemAccess= policy:
using BPF_MAP_GET_FD_BY_ID to get an FD to the verity_devices map,
then BPF_MAP_UPDATE_ELEM to inject fake trusted devices. Protection of
program and link IDs is defense-in-depth (the kernel already blocks
BPF_LINK_UPDATE and BPF_LINK_DETACH for LSM tracing links).

Additionally, a ptrace guard (lsm/ptrace_access_check) blocks
PTRACE_MODE_ATTACH to PID1 from other processes, preventing
extraction of sensitive state from PID1's address space via
ptrace, /proc/1/mem, process_vm_readv(), or pidfd_getfd().

Guard logic:
1. Allow all BPF ops from PID1 (tgid == 1, unspoofable)
2. Deny BPF_MAP_GET_FD_BY_ID for our protected map IDs
3. Deny BPF_PROG_GET_FD_BY_ID for our program IDs
4. Deny BPF_LINK_GET_FD_BY_ID for our link IDs
5. Allow everything else (zero collateral damage)

The guard starts inactive (all protected IDs default to 0 in .bss).
After skeleton attach, PID1 queries kernel-assigned IDs via
bpf_obj_get_info_by_fd() and writes them into the guard globals via
the mmap'd .bss, then extracts owned FDs and destroys the skeleton.
Destroying the skeleton unmaps the .bss page from PID1's address
space, so no BPF state — guard globals, protected map/prog/link IDs,
initramfs_s_dev — remains readable via /proc/1/mem. The kernel map
data persists (held by the dup'd FDs) but is only accessible via
bpf_map_* syscalls, which the guard itself blocks for non-PID1.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---

diff --git a/src/bpf/restrict-fsaccess.bpf.c b/src/bpf/restrict-fsaccess.bpf.c
index a9f368ab399..538ddf3ef17 100644
--- a/src/bpf/restrict-fsaccess.bpf.c
+++ b/src/bpf/restrict-fsaccess.bpf.c
@@ -34,8 +34,9 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-#define PROT_EXEC 0x4
-#define VM_EXEC   0x00000004
+#define PROT_EXEC          0x4
+#define VM_EXEC            0x00000004
+#define PTRACE_MODE_ATTACH 0x02 /* mode bit tested by the ptrace guard; NOTE(review): should mirror the kernel's own definition — confirm */
 
 /* ---- Maps ---- */
 
@@ -53,6 +54,19 @@ struct {
  * â the window is closed." */
 volatile __u32 initramfs_s_dev;
 
+/* ---- Self-protection guard globals (set by PID1 after attach) ----
+ *
+ * While all IDs are 0 (the .bss default), the guard is inactive — no real BPF
+ * object has ID 0, so no comparisons match. PID1 populates these after
+ * attaching all programs. */
+volatile __u32 protected_map_id_verity; /* ID of the verity_devices map */
+volatile __u32 protected_map_id_bss;    /* ID of the .bss map that holds these globals */
+
+/* Must equal _RESTRICT_FILESYSTEM_ACCESS_LINK_MAX in bpf-restrict-fsaccess.h — update when adding programs */
+#define NUM_PROTECTED_OBJS 9 /* 5 enforcement + 4 guard (bpf, bpf_map, bpf_prog, ptrace) */
+volatile __u32 protected_prog_ids[NUM_PROTECTED_OBJS]; /* program IDs; userspace fills slot i for link enum value i */
+volatile __u32 protected_link_ids[NUM_PROTECTED_OBJS]; /* link IDs; userspace fills slot i for link enum value i */
+
 /* ---- Integrity tracking hooks ---- */
 
 SEC("lsm/bdev_setintegrity")
@@ -149,4 +163,113 @@ int BPF_PROG(restrict_fsaccess_file_mprotect, struct vm_area_struct *vma,
         return check_trusted_file(file);
 }
 
+/* ---- PID1 ptrace protection ----
+ *
+ * Blocks PTRACE_MODE_ATTACH access to PID1 from any other process. This
+ * prevents ptrace(PTRACE_ATTACH), /proc/1/mem, process_vm_readv(), and
+ * pidfd_getfd() from extracting sensitive state from PID1's address space.
+ *
+ * PTRACE_MODE_READ is allowed — monitoring tools and systemctl need
+ * /proc/1/status, /proc/1/fd/, /proc/1/ns/ *, etc.
+ *
+ * PID1 accessing itself is allowed. */
+
+SEC("lsm/ptrace_access_check")
+int BPF_PROG(restrict_fsaccess_ptrace_guard, struct task_struct *child,
+             unsigned int mode)
+{
+        /* Targets outside PID1's thread group are none of our business. */
+        if (child->tgid != 1)
+                return 0;
+
+        /* Read-level access stays permitted; only attach-level access is guarded. */
+        if ((mode & PTRACE_MODE_ATTACH) == 0)
+                return 0;
+
+        /* The upper 32 bits of the helper's result are compared against 1, i.e.
+         * the request is denied unless it comes from PID1 itself (any thread). */
+        __u32 caller_tgid = bpf_get_current_pid_tgid() >> 32;
+
+        return caller_tgid == 1 ? 0 : -EPERM;
+}
+
+/* ---- Self-protection guard ----
+ *
+ * Three hooks protect our BPF objects from non-PID1 processes:
+ *
+ *   lsm/bpf_map  — fires inside bpf_map_new_fd(), the chokepoint for ALL
+ *                   code paths that produce a map FD (BPF_MAP_GET_FD_BY_ID,
+ *                   BPF_OBJ_GET, BPF_MAP_CREATE). Blocks the primary attack:
+ *                   obtaining an FD to verity_devices to inject fake trusted
+ *                   devices via BPF_MAP_UPDATE_ELEM.
+ *
+ *   lsm/bpf_prog — fires inside bpf_prog_new_fd(), same chokepoint coverage
+ *                   for programs. Defense-in-depth.
+ *
+ *   lsm/bpf      — handles BPF_LINK_GET_FD_BY_ID only. There is no
+ *                   security_bpf_link() hook in the kernel, so link
+ *                   protection uses the command-level bpf() hook. This is
+ *                   sufficient: we don't pin links in production, so
+ *                   BPF_OBJ_GET is not an attack vector for links. */
+
+SEC("lsm/bpf_map")
+int BPF_PROG(restrict_fsaccess_bpf_map_guard, struct bpf_map *map,
+             unsigned int fmode)
+{
+        /* PID1 (tgid 1) is always let through. */
+        if ((bpf_get_current_pid_tgid() >> 32) == 1)
+                return 0;
+
+        /* ID 0 never matches, so still-zero (unpopulated) guard globals protect nothing. */
+        __u32 map_id = map->id;
+        if (map_id == 0)
+                return 0;
+
+        return (map_id == protected_map_id_verity ||
+                map_id == protected_map_id_bss) ? -EPERM : 0;
+}
+
+SEC("lsm/bpf_prog")
+int BPF_PROG(restrict_fsaccess_bpf_prog_guard, struct bpf_prog *prog)
+{
+        /* Requests coming from PID1 (tgid 1) are never filtered. */
+        if ((bpf_get_current_pid_tgid() >> 32) == 1)
+                return 0;
+
+        /* ID 0 is skipped so that it cannot match unpopulated (zero) table slots. */
+        __u32 prog_id = BPF_CORE_READ(prog, aux, id);
+        if (prog_id == 0)
+                return 0;
+
+        for (int slot = 0; slot < NUM_PROTECTED_OBJS; slot++)
+                if (protected_prog_ids[slot] == prog_id)
+                        return -EPERM;
+
+        return 0;
+}
+
+SEC("lsm/bpf")
+int BPF_PROG(restrict_fsaccess_bpf_guard, int cmd, union bpf_attr *attr,
+             unsigned int size)
+{
+        /* PID1 (tgid 1) may issue any bpf() command. */
+        if ((bpf_get_current_pid_tgid() >> 32) == 1)
+                return 0;
+
+        /* Only the link FD-by-ID lookup is filtered here; every other command passes. */
+        if (cmd != BPF_LINK_GET_FD_BY_ID)
+                return 0;
+
+        /* link_id/map_id/prog_id share the same offset in the bpf_attr union */
+        __u32 link_id = attr->link_id;
+        if (link_id == 0)
+                return 0;
+
+        for (int slot = 0; slot < NUM_PROTECTED_OBJS; slot++)
+                if (protected_link_ids[slot] == link_id)
+                        return -EPERM;
+
+        return 0;
+}
+
 static const char _license[] SEC("license") = "GPL";
diff --git a/src/core/bpf-restrict-fsaccess.c b/src/core/bpf-restrict-fsaccess.c
index dc8a7d63a75..af8a97c6627 100644
--- a/src/core/bpf-restrict-fsaccess.c
+++ b/src/core/bpf-restrict-fsaccess.c
@@ -30,6 +30,10 @@ const char* const restrict_fsaccess_link_names[_RESTRICT_FILESYSTEM_ACCESS_LINK_
         [RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK]        = "restrict-fsaccess-bprm-check-link",
         [RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE]         = "restrict-fsaccess-mmap-file-link",
         [RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT]     = "restrict-fsaccess-file-mprotect-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_PTRACE_GUARD]      = "restrict-fsaccess-ptrace-guard-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_MAP_GUARD]     = "restrict-fsaccess-bpf-map-guard-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_PROG_GUARD]    = "restrict-fsaccess-bpf-prog-guard-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_GUARD]         = "restrict-fsaccess-bpf-guard-link",
 };
 
 #if BPF_FRAMEWORK && HAVE_LSM_INTEGRITY_TYPE
@@ -44,8 +48,19 @@ static struct restrict_fsaccess_bpf *restrict_fsaccess_bpf_free(struct restrict_
 
 DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fsaccess_bpf *, restrict_fsaccess_bpf_free);
 
-/* Verify that restrict_fsaccess_bss matches the skeleton's .bss layout */
+/* Verify that restrict_fsaccess_bss matches the skeleton's .bss layout. The sizeof
+ * check catches field additions/removals; the offsetof checks catch reordering of
+ * the three scalar fields (the two ID arrays are covered by the sizeof check only).
+ * Field order in restrict_fsaccess_bss must match the BPF global declaration order
+ * in restrict-fsaccess.bpf.c — this is what bpftool uses for the generated struct.
+ * The read-modify-write in restrict_fsaccess_clear_initramfs_trust() depends on it. */
 assert_cc(sizeof(struct restrict_fsaccess_bss) == sizeof_field(struct restrict_fsaccess_bpf, bss[0]));
+assert_cc(offsetof(struct restrict_fsaccess_bss, initramfs_s_dev) ==
+          offsetof(typeof_field(struct restrict_fsaccess_bpf, bss[0]), initramfs_s_dev));
+assert_cc(offsetof(struct restrict_fsaccess_bss, protected_map_id_verity) ==
+          offsetof(typeof_field(struct restrict_fsaccess_bpf, bss[0]), protected_map_id_verity));
+assert_cc(offsetof(struct restrict_fsaccess_bss, protected_map_id_bss) ==
+          offsetof(typeof_field(struct restrict_fsaccess_bpf, bss[0]), protected_map_id_bss));
 
 /* Build the skeleton links array indexed by the link enum. */
 #define RESTRICT_FSACCESS_LINKS(obj) {                                                                      \
@@ -54,6 +69,10 @@ assert_cc(sizeof(struct restrict_fsaccess_bss) == sizeof_field(struct restrict_f
         [RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK]        = (obj)->links.restrict_fsaccess_bprm_check,                 \
         [RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE]         = (obj)->links.restrict_fsaccess_mmap_file,                  \
         [RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT]     = (obj)->links.restrict_fsaccess_file_mprotect,              \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_PTRACE_GUARD]      = (obj)->links.restrict_fsaccess_ptrace_guard,               \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_MAP_GUARD]     = (obj)->links.restrict_fsaccess_bpf_map_guard,              \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_PROG_GUARD]    = (obj)->links.restrict_fsaccess_bpf_prog_guard,             \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_GUARD]         = (obj)->links.restrict_fsaccess_bpf_guard,                  \
 }
 
 static bool dm_verity_require_signatures(void) {
@@ -209,6 +228,63 @@ static int bpf_get_map_id(int fd, uint32_t *ret_id) {
         return 0;
 }
 
+static int bpf_get_link_ids(int fd, uint32_t *ret_link_id, uint32_t *ret_prog_id) {
+        /* Looks up the kernel IDs of the BPF link behind fd and of the program
+         * associated with it. Either output pointer may be NULL to skip that value. */
+        struct bpf_link_info link_info = {};
+        uint32_t info_size = sizeof(link_info);
+
+        if (fd < 0)
+                return -EBADF;
+
+        int r = sym_bpf_obj_get_info_by_fd(fd, &link_info, &info_size);
+        if (r < 0)
+                return r;
+
+        if (ret_link_id)
+                *ret_link_id = link_info.id;
+        if (ret_prog_id)
+                *ret_prog_id = link_info.prog_id;
+        return 0;
+}
+
+/* Stores the kernel-assigned IDs of our maps, links and programs in the guard
+ * globals, so that the guard hooks deny non-PID1 access to exactly those objects. */
+int bpf_restrict_fsaccess_populate_guard(struct restrict_fsaccess_bpf *obj) {
+        int r;
+
+        assert(obj);
+
+        struct bpf_link *links[] = RESTRICT_FSACCESS_LINKS(obj);
+        assert_cc(ELEMENTSOF(links) == _RESTRICT_FILESYSTEM_ACCESS_LINK_MAX);
+
+        /* Protected maps: verity_devices and the .bss map (which holds these very globals). */
+        r = bpf_get_map_id(sym_bpf_map__fd(obj->maps.verity_devices), &obj->bss->protected_map_id_verity);
+        if (r < 0)
+                return log_error_errno(r, "bpf-restrict-fsaccess: Failed to get verity_devices map ID: %m");
+
+        r = bpf_get_map_id(sym_bpf_map__fd(obj->maps.bss), &obj->bss->protected_map_id_bss);
+        if (r < 0)
+                return log_error_errno(r, "bpf-restrict-fsaccess: Failed to get .bss map ID: %m");
+
+        /* Slot i of both ID tables belongs to link enum value i; a single query on
+         * the link FD yields the link's ID as well as the ID of its program. */
+        for (size_t i = 0; i < ELEMENTSOF(links); i++) {
+                uint32_t *link_slot = &obj->bss->protected_link_ids[i];
+                uint32_t *prog_slot = &obj->bss->protected_prog_ids[i];
+
+                r = bpf_get_link_ids(sym_bpf_link__fd(links[i]), link_slot, prog_slot);
+                if (r < 0)
+                        return log_error_errno(r, "bpf-restrict-fsaccess: Failed to get link/prog IDs for %s: %m",
+                                               restrict_fsaccess_link_names[i]);
+        }
+
+        log_info("bpf-restrict-fsaccess: Guard globals populated (verity_map=%u, bss_map=%u)",
+                 (unsigned) obj->bss->protected_map_id_verity,
+                 (unsigned) obj->bss->protected_map_id_bss);
+        return 0;
+}
+
 /* Validate that deserialized FDs actually reference our LSM BPF links. A
  * corrupted serialization file could leave FDs pointing at arbitrary kernel
  * objects; a stale FD could point at a BPF link of an entirely different type
@@ -321,12 +397,18 @@ int bpf_restrict_fsaccess_setup(Manager *m) {
 
         log_info("bpf-restrict-fsaccess: LSM BPF programs attached");
 
+        /* Now that all programs are attached, populate the guard's globals with
+         * the kernel-assigned IDs of our maps, programs, and links. From this
+         * point on, non-PID1 processes cannot obtain FDs to our BPF objects. */
+        r = bpf_restrict_fsaccess_populate_guard(obj);
+        if (r < 0)
+                return r;
+
         /* Extract owned FDs from the skeleton. These keep the kernel BPF objects
          * alive after the skeleton is destroyed. Destroying the skeleton unmaps
-         * the .bss page from our address space so no BPF state is reachable via
-         * /proc/1/mem. */
+         * the .bss page from our address space so no BPF state (guard globals,
+         * map IDs, initramfs_s_dev) is reachable via /proc/1/mem. */
         struct bpf_link *links[] = RESTRICT_FSACCESS_LINKS(obj);
-
         FOREACH_ELEMENT(link, links) {
                 size_t idx = link - links;
 
@@ -407,6 +489,10 @@ int bpf_restrict_fsaccess_setup(Manager *m) {
                                  "bpf-restrict-fsaccess: RestrictFileSystemAccess= requested but BPF framework is not compiled in.");
 }
 
+int bpf_restrict_fsaccess_populate_guard(struct restrict_fsaccess_bpf *obj) {
+        return 0; /* no-op stub, obj is ignored; NOTE(review): presumably the build variant without BPF framework support — confirm against the enclosing #if */
+}
+
 int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m) {
         return 0;
 }
diff --git a/src/core/bpf-restrict-fsaccess.h b/src/core/bpf-restrict-fsaccess.h
index 8a0a9cf2677..a23beab4ce5 100644
--- a/src/core/bpf-restrict-fsaccess.h
+++ b/src/core/bpf-restrict-fsaccess.h
@@ -23,6 +23,10 @@ enum {
         RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK,
         RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE,
         RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT,
+        RESTRICT_FILESYSTEM_ACCESS_LINK_PTRACE_GUARD,
+        RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_MAP_GUARD,
+        RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_PROG_GUARD,
+        RESTRICT_FILESYSTEM_ACCESS_LINK_BPF_GUARD,
         _RESTRICT_FILESYSTEM_ACCESS_LINK_MAX,
 };
 
@@ -39,12 +43,17 @@ enum {
  * bpf_map_lookup_elem/bpf_map_update_elem on the serialized .bss map FD. */
 struct restrict_fsaccess_bss {
         uint32_t initramfs_s_dev; /* kernel dev_t encoding: (major << 20) | minor */
+        uint32_t protected_map_id_verity; /* kernel ID of the verity_devices map */
+        uint32_t protected_map_id_bss;    /* kernel ID of the .bss map */
+        uint32_t protected_prog_ids[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX]; /* program IDs, one slot per link enum value */
+        uint32_t protected_link_ids[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX]; /* link IDs, one slot per link enum value */
 };
 
 extern const char* const restrict_fsaccess_link_names[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX];
 
 bool bpf_restrict_fsaccess_supported(void);
 int bpf_restrict_fsaccess_setup(Manager *m);
+int bpf_restrict_fsaccess_populate_guard(struct restrict_fsaccess_bpf *obj);
 
 int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m);
 int bpf_restrict_fsaccess_serialize(Manager *m, FILE *f, FDSet *fds);