core: add RestrictFileSystemAccess= BPF LSM for dm-verity execution enforcement

author Christian Brauner <brauner@kernel.org>

Fri, 8 May 2026 08:45:23 +0000 (10:45 +0200)

committer Christian Brauner <brauner@kernel.org>

Wed, 13 May 2026 08:36:12 +0000 (10:36 +0200)
author Christian Brauner <brauner@kernel.org>
Fri, 8 May 2026 08:45:23 +0000 (10:45 +0200)
committer Christian Brauner <brauner@kernel.org>
Wed, 13 May 2026 08:36:12 +0000 (10:36 +0200)
diff --git a/man/kernel-command-line.xml b/man/kernel-command-line.xml

index 83544b3606464af6634b4e0eae29ace7efb12d8e..03765010f0744e91aec9441af84257d698667483 100644 (file)
--- a/man/kernel-command-line.xml
+++ b/man/kernel-command-line.xml
@@ -79,6 +79,17 @@
          </listitem>
        </varlistentry>
  
+      <varlistentry>
+        <term><varname>systemd.restrict_filesystem_access=</varname></term>
+        <listitem>
+          <para>Controls the <varname>RestrictFileSystemAccess=</varname> execution enforcement policy. For
+          details, see
+          <citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para>
+
+          <xi:include href="version-info.xml" xpointer="v261"/>
+        </listitem>
+      </varlistentry>
+
        <varlistentry>
          <term><varname>systemd.mask=</varname></term>
          <term><varname>systemd.wants=</varname></term>
diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml

index eb14cb7f3074636822c6b6a671c27945e8fe6cab..fb565b03506c7f4b1300127a10a1073467266693 100644 (file)
--- a/man/systemd-system.conf.xml
+++ b/man/systemd-system.conf.xml
@@ -532,6 +532,49 @@
          <xi:include href="version-info.xml" xpointer="v256"/></listitem>
        </varlistentry>
  
+      <varlistentry>
+        <term><varname>RestrictFileSystemAccess=</varname></term>
+
+        <listitem><para>Takes a boolean argument or the special value <literal>exec</literal>. Defaults to
+        <literal>no</literal>. When enabled, PID 1 loads a BPF LSM program that enforces a deny-default
+        execution policy: only binaries residing on signed dm-verity block devices (and the initramfs during
+        early boot) are permitted to execute. Execution from tmpfs, procfs, sysfs, unsigned dm-verity devices,
+        and anonymous executable memory mappings is denied.</para>
+
+        <para>This setting is intended as one component of an image-based, fully verified system, where the
+        whole boot chain (firmware, kernel image, kernel command line, initramfs) is measured and attested.
+        On a general-purpose system without such guarantees it does not provide a meaningful security
+        boundary on its own: an attacker with sufficient privilege to edit
+        <filename>system.conf</filename>, modify the kernel command line, or kexec into an unsigned initrd
+        can disable or bypass the policy.</para>
+
+        <para>The enforcement hooks block <function>execve()</function> of untrusted binaries
+        (<literal>bprm_check_security</literal>), <constant>PROT_EXEC</constant> memory mappings including
+        shared libraries (<literal>mmap_file</literal>), and write-to-execute transitions such as JIT
+        compilation (<literal>file_mprotect</literal>).</para>
+
+        <para>Note that execution from overlayfs mounts is blocked even if the underlying layers reside on
+        signed dm-verity devices, because the BPF program sees the overlay filesystem's anonymous device
+        number rather than the underlying block device. Multi-device filesystems such as btrfs are similarly
+        unsupported.</para>
+
+        <para>Note that, without further measures to secure the system, kexec can be used to circumvent this.</para>
+
+        <para>This requires the kernel to be booted with <literal>dm_verity.require_signatures=1</literal>
+        on the kernel command line and with BPF LSM enabled (<literal>lsm=...,bpf</literal>). If either
+        prerequisite is not met, PID 1 will refuse to complete startup.</para>
+
+        <para>The value <literal>yes</literal> is equivalent to <literal>exec</literal>. Additional
+        modes may be added in the future.</para>
+
+        <para>This option may also be set via the <varname>systemd.restrict_filesystem_access=</varname> kernel command
+        line option, see
+        <citerefentry><refentrytitle>kernel-command-line</refentrytitle><manvolnum>7</manvolnum></citerefentry>.
+        </para>
+
+        <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+      </varlistentry>
+
        <varlistentry>
          <term><varname>SystemCallArchitectures=</varname></term>
  
diff --git a/src/bpf/meson.build b/src/bpf/meson.build

index 32ab32e93dcdba55e843d01fb130c3ea74a81f56..13af4c968e825557e1321e6622a06151fce7de27 100644 (file)
--- a/src/bpf/meson.build
+++ b/src/bpf/meson.build
@@ -311,6 +311,19 @@ endif
  
  conf.set10('HAVE_VMLINUX_H', use_provided_vmlinux_h or use_generated_vmlinux_h)
  
+# 'enum lsm_integrity_type' was added together with the bdev_setintegrity LSM
+# hook in kernel commit 2deeb6c333e5 (v6.12). The generated vmlinux.h reflects
+# the build host's running kernel BTF; a provided vmlinux.h can be older, so probe.
+have_lsm_integrity_type = false
+if use_generated_vmlinux_h
+        have_lsm_integrity_type = true
+elif use_provided_vmlinux_h
+        have_lsm_integrity_type = cc.compiles(
+                '#include "@0@"\nenum lsm_integrity_type _t;\n'.format(provided_vmlinux_h_path),
+                name : 'enum lsm_integrity_type in vmlinux.h')
+endif
+conf.set10('HAVE_LSM_INTEGRITY_TYPE', have_lsm_integrity_type)
+
  conf.set10('ENABLE_SYSCTL_BPF', conf.get('HAVE_VMLINUX_H') == 1 and libbpf.version().version_compare('>= 0.7'))
  
  bpf_programs = [
@@ -318,6 +331,11 @@ bpf_programs = [
                  'source' : files('bind-iface.bpf.c'),
                  'condition' : 'BPF_FRAMEWORK',
          },
+        {
+                'source' : files('restrict-fsaccess.bpf.c'),
+                'condition' : 'HAVE_LSM_INTEGRITY_TYPE',
+                'depends' : vmlinux_h_dependency,
+        },
          {
                  'source' : files('restrict-fs.bpf.c'),
                  'condition' : 'BPF_FRAMEWORK',
diff --git a/src/bpf/restrict-fsaccess.bpf.c b/src/bpf/restrict-fsaccess.bpf.c

new file mode 100644 (file)

index 0000000..a9f368a
--- /dev/null
+++ b/src/bpf/restrict-fsaccess.bpf.c
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* Trusted Execution BPF LSM program.
+ *
+ * Enforces that only binaries from signed dm-verity block devices (or the
+ * initramfs during early boot) can be executed.
+ *
+ * Architecture:
+ *   - bdev_setintegrity hook:  self-populates a map of trusted devices when
+ *                              dm-verity signals signature validity
+ *   - bdev_free_security hook: removes devices from the map on teardown
+ *   - bprm_check_security:    blocks execve() from untrusted sources
+ *   - mmap_file:              blocks PROT_EXEC mmap from untrusted sources
+ *   - file_mprotect:          blocks W->X transitions from untrusted sources
+ */
+
+/* If offsetof() is implemented via __builtin_offsetof() then it doesn't work on current compilers, since the
+ * built-ins do not understand CO-RE. Let's undefine any such macros here, to force bpf_helpers.h to define
+ * its own definitions for this. (In new versions it will do so automatically, but at least in libbpf 1.1.0
+ * it does not.) */
+#undef offsetof
+#undef container_of
+
+#include "vmlinux.h"
+
+#include <errno.h>                      /* IWYU pragma: keep */
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define PROT_EXEC 0x4
+#define VM_EXEC   0x00000004
+
+/* ---- Maps ---- */
+
+struct {
+        __uint(type, BPF_MAP_TYPE_HASH);
+        __uint(max_entries, 0);  /* placeholder; PID1 sets the real size before loading the object */
+        __type(key, __u32);     /* dev_t from bdev->bd_dev */
+        __type(value, __u8);    /* 1 = signature valid, 0 = reported without signature data */
+} verity_devices SEC(".maps");
+
+/* ---- Globals (set by PID1 via skeleton) ---- */
+
+/* Device number of the initramfs superblock. PID1 sets this at load time while
+ * still in the initrd and clears it (to 0) right before switch_root. A value of
+ * 0 means "no initramfs trust — the window is closed." */
+volatile __u32 initramfs_s_dev;
+
+/* ---- Integrity tracking hooks ---- */
+
+SEC("lsm/bdev_setintegrity") /* records the dm-verity signature state reported for a block device */
+int BPF_PROG(restrict_fsaccess_bdev_setintegrity, struct block_device *bdev,
+             enum lsm_integrity_type type, const void *value, __u64 size)
+{
+        if (type == LSM_INT_DMVERITY_SIG_VALID) { /* every other integrity type is ignored */
+                __u32 dev = bdev->bd_dev;
+                __u8 valid = value && size > 0; /* no data attached => stored as 0, i.e. not trusted */
+                bpf_map_update_elem(&verity_devices, &dev, &valid, BPF_ANY); /* return value ignored; a device missing from the map is denied by check_trusted_file() */
+        }
+
+        return 0; /* this hook only observes, it never rejects the update */
+}
+
+SEC("lsm/bdev_free_security") /* forgets a block device so that its device number carries no stale trust */
+void BPF_PROG(restrict_fsaccess_bdev_free, struct block_device *bdev)
+{
+        __u32 dev = bdev->bd_dev;
+        bpf_map_delete_elem(&verity_devices, &dev); /* return value ignored, so devices that were never tracked are harmless */
+}
+
+/* ---- Enforcement helpers ---- */
+
+/* Check whether a file is from a trusted source, keyed on the device number of
+ * the superblock behind its inode. Returns 0 (allow) or -EPERM (deny). */
+static __always_inline int check_trusted_file(struct file *file)
+{
+        __u32 s_dev;
+        __u8 *sig_valid;
+
+        BPF_CORE_READ_INTO(&s_dev, file, f_inode, i_sb, s_dev); /* file->f_inode->i_sb->s_dev */
+
+        /* Check initramfs trust (active only while initramfs_s_dev is non-zero) */
+        if (initramfs_s_dev != 0 && s_dev == initramfs_s_dev)
+                return 0;
+
+        /* Check verity device map: the entry must exist and hold a non-zero value */
+        sig_valid = bpf_map_lookup_elem(&verity_devices, &s_dev);
+        if (sig_valid && *sig_valid)
+                return 0;
+
+        return -EPERM; /* deny by default */
+}
+
+/* ---- Enforcement hooks ---- */
+
+SEC("lsm/bprm_check_security") /* gates execve() on where the binary being executed lives */
+int BPF_PROG(restrict_fsaccess_bprm_check, struct linux_binprm *bprm)
+{
+        struct file *file;
+
+        BPF_CORE_READ_INTO(&file, bprm, file); /* bprm->file */
+        return check_trusted_file(file); /* 0 lets the exec proceed, -EPERM rejects it */
+}
+
+SEC("lsm/mmap_file") /* gates executable mappings; only prot is consulted, reqprot and flags are unused */
+int BPF_PROG(restrict_fsaccess_mmap_file, struct file *file, unsigned long reqprot,
+             unsigned long prot, unsigned long flags)
+{
+        /* Only enforce on executable mappings; everything else passes untouched */
+        if (!(prot & PROT_EXEC))
+                return 0;
+
+        /* Anonymous executable mapping — no file backing, deny */
+        if (!file)
+                return -EPERM;
+
+        return check_trusted_file(file); /* file-backed: allowed only from a trusted superblock */
+}
+
+SEC("lsm/file_mprotect") /* gates mprotect() calls that would newly make a mapping executable */
+int BPF_PROG(restrict_fsaccess_file_mprotect, struct vm_area_struct *vma,
+             unsigned long reqprot, unsigned long prot)
+{
+        struct file *file;
+        unsigned long vm_flags;
+
+        /* Only enforce when the requested protection includes PROT_EXEC */
+        if (!(prot & PROT_EXEC))
+                return 0;
+
+        /* If VM_EXEC is already set, the mapping is already executable — this
+         * mprotect isn't granting new executable capability, allow */
+        BPF_CORE_READ_INTO(&vm_flags, vma, vm_flags); /* vma->vm_flags */
+        if (vm_flags & VM_EXEC)
+                return 0;
+
+        /* Mapping without a backing file (vma->vm_file is NULL) — deny */
+        BPF_CORE_READ_INTO(&file, vma, vm_file);
+        if (!file)
+                return -EPERM;
+
+        return check_trusted_file(file); /* file-backed: same trust rule as exec and mmap */
+}
+
+static const char _license[] SEC("license") = "GPL";
diff --git a/src/core/bpf-restrict-fs.c b/src/core/bpf-restrict-fs.c

index e60e6ca7efcfe4bcfdd4346af4426a2415225bf4..d3c6adc14525165cdca74a57169f797396a833ec 100644 (file)
--- a/src/core/bpf-restrict-fs.c
+++ b/src/core/bpf-restrict-fs.c
@@ -30,19 +30,6 @@ static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj)
  
  DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free);
  
-static bool bpf_can_link_lsm_program(struct bpf_program *prog) {
-        _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
-
-        assert(prog);
-
-        link = sym_bpf_program__attach_lsm(prog);
-
-        /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory
-         * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence
-         * BPF_LSM_MAC attach type) is not supported. */
-        return bpf_get_error_translated(link) == 0;
-}
-
  static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
          _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
          _cleanup_close_ int inner_map_fd = -EBADF;
diff --git a/src/core/bpf-restrict-fsaccess.c b/src/core/bpf-restrict-fsaccess.c

new file mode 100644 (file)

index 0000000..35bb2b8
--- /dev/null
+++ b/src/core/bpf-restrict-fsaccess.c
@@ -0,0 +1,286 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include "bpf-restrict-fsaccess.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "lsm-util.h"
+#include "manager.h"
+#include "memory-util.h"
+#include "string-table.h"
+
+/* DMVERITY_DEVICES_MAX lives in bpf-restrict-fsaccess.h for sharing with tests. */
+
+static const char* const restrict_filesystem_access_table[_RESTRICT_FILESYSTEM_ACCESS_MAX] = { /* setting values as spelled in configuration */
+        [RESTRICT_FILESYSTEM_ACCESS_NO]   = "no",
+        [RESTRICT_FILESYSTEM_ACCESS_EXEC] = "exec",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(restrict_filesystem_access, RestrictFileSystemAccess, RESTRICT_FILESYSTEM_ACCESS_EXEC); /* the man page documents yes as equivalent to exec; presumably the last argument is what a boolean true maps to — confirm against string-table.h */
+
+const char* const restrict_fsaccess_link_names[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX] = { /* human-readable name per link, indexed by the RESTRICT_FILESYSTEM_ACCESS_LINK_* enum */
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_SETINTEGRITY] = "restrict-fsaccess-bdev-setintegrity-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_FREE]         = "restrict-fsaccess-bdev-free-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK]        = "restrict-fsaccess-bprm-check-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE]         = "restrict-fsaccess-mmap-file-link",
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT]     = "restrict-fsaccess-file-mprotect-link",
+};
+
+#if BPF_FRAMEWORK && HAVE_LSM_INTEGRITY_TYPE
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "restrict-fsaccess-skel.h"
+
+static struct restrict_fsaccess_bpf *restrict_fsaccess_bpf_free(struct restrict_fsaccess_bpf *obj) { /* destroys the skeleton; always returns NULL */
+        restrict_fsaccess_bpf__destroy(obj);
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fsaccess_bpf *, restrict_fsaccess_bpf_free); /* provides restrict_fsaccess_bpf_freep, used with _cleanup_() below */
+
+/* Verify that restrict_fsaccess_bss has the same size as the skeleton's .bss struct (field offsets are not compared) */
+assert_cc(sizeof(struct restrict_fsaccess_bss) == sizeof_field(struct restrict_fsaccess_bpf, bss[0]));
+
+/* Expands to a brace initializer listing the skeleton's links in link-enum order; use it only to initialize an array. */
+#define RESTRICT_FSACCESS_LINKS(obj) {                                                                      \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_SETINTEGRITY] = (obj)->links.restrict_fsaccess_bdev_setintegrity,          \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_FREE]         = (obj)->links.restrict_fsaccess_bdev_free,                  \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK]        = (obj)->links.restrict_fsaccess_bprm_check,                 \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE]         = (obj)->links.restrict_fsaccess_mmap_file,                  \
+        [RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT]     = (obj)->links.restrict_fsaccess_file_mprotect,              \
+}
+
+/* Returns true only if the dm_verity module parameter require_signatures reads as enabled. */
+static bool dm_verity_require_signatures(void) {
+        int enabled = read_boolean_file("/sys/module/dm_verity/parameters/require_signatures");
+
+        if (enabled >= 0)
+                return enabled > 0;
+
+        /* A missing parameter file is a quiet "no"; any other read failure is logged before giving up. */
+        if (enabled != -ENOENT)
+                log_warning_errno(enabled, "bpf-restrict-fsaccess: Failed to read dm-verity require_signatures: %m");
+        return false;
+}
+
+/* Stores the kernel-encoded device number of the filesystem backing /usr/ in *ret; 0 or negative errno. */
+static int get_root_s_dev(uint32_t *ret) {
+        struct stat usr_st;
+
+        assert(ret);
+
+        /* /usr/ is probed instead of /: executable code lives below /usr/, and with a
+         * separate /usr partition the (writable, non-executable) / is deliberately left untrusted. */
+        if (stat("/usr/", &usr_st) < 0)
+                return log_error_errno(errno, "bpf-restrict-fsaccess: Failed to stat /usr/ filesystem: %m");
+
+        *ret = STAT_DEV_TO_KERNEL(usr_st.st_dev);
+        return 0;
+}
+
+/* Opens the skeleton, sizes verity_devices to DMVERITY_DEVICES_MAX and loads it; hands the object out via *ret. */
+static int prepare_restrict_fsaccess_bpf(struct restrict_fsaccess_bpf **ret) {
+        _cleanup_(restrict_fsaccess_bpf_freep) struct restrict_fsaccess_bpf *skel = NULL;
+        int k;
+
+        assert(ret);
+
+        skel = restrict_fsaccess_bpf__open();
+        if (!skel)
+                return log_error_errno(errno, "bpf-restrict-fsaccess: Failed to open BPF object: %m");
+
+        k = sym_bpf_map__set_max_entries(skel->maps.verity_devices, DMVERITY_DEVICES_MAX);
+        if (k < 0)
+                return log_error_errno(k, "bpf-restrict-fsaccess: Failed to size hash table: %m");
+        k = restrict_fsaccess_bpf__load(skel);
+        if (k < 0)
+                return log_error_errno(k, "bpf-restrict-fsaccess: Failed to load BPF object: %m");
+
+        *ret = TAKE_PTR(skel);
+        return 0;
+}
+
+/* Probes once whether the BPF LSM policy can be used; the verdict is cached in a function-local static. */
+bool bpf_restrict_fsaccess_supported(void) {
+        _cleanup_(restrict_fsaccess_bpf_freep) struct restrict_fsaccess_bpf *probe = NULL;
+        static int cached = -1;
+        int k;
+
+        if (cached >= 0)
+                return cached;
+
+        /* Record "unsupported" up front; only the final step below flips the verdict. */
+        cached = false;
+
+        if (dlopen_bpf(LOG_WARNING) < 0)
+                return false;
+
+        k = lsm_supported("bpf");
+        if (k == -ENOPKG)
+                log_debug_errno(k, "bpf-restrict-fsaccess: securityfs not mounted, BPF LSM not available.");
+        else if (k < 0)
+                log_warning_errno(k, "bpf-restrict-fsaccess: Can't determine whether the BPF LSM module is used: %m");
+        else if (k == 0)
+                log_info("bpf-restrict-fsaccess: BPF LSM hook not enabled in the kernel, not supported.");
+        if (k <= 0)
+                return false;
+
+        if (prepare_restrict_fsaccess_bpf(&probe) < 0)
+                return false;
+
+        if (!bpf_can_link_lsm_program(probe->progs.restrict_fsaccess_bprm_check)) {
+                log_warning("bpf-restrict-fsaccess: Failed to link program; assuming BPF LSM is not available.");
+                return false;
+        }
+
+        return (cached = true);
+}
+
+/* Close the initramfs trust window by zeroing initramfs_s_dev in the BPF .bss map (the caller
+ * does this ahead of switch_root). The .bss is a BPF_F_MMAPABLE array map — mmap it and do a
+ * single aligned 4-byte store instead of a full-value read-modify-write via bpf_map_update_elem.
+ * Returns 0 on success, or a negative errno if the map cannot be mapped. */
+static int restrict_fsaccess_clear_initramfs_trust(int bss_map_fd) {
+        void *p;
+
+        assert(bss_map_fd >= 0);
+        assert_cc(offsetof(struct restrict_fsaccess_bss, initramfs_s_dev) == 0);
+
+        p = mmap(NULL, page_size(), PROT_READ | PROT_WRITE, MAP_SHARED, bss_map_fd, 0);
+        if (p == MAP_FAILED)
+                return log_error_errno(errno, "bpf-restrict-fsaccess: Failed to mmap .bss map: %m");
+
+        /* initramfs_s_dev is at offset 0 in the .bss layout (asserted above). The
+         * volatile access makes the compiler emit this aligned 32-bit store as written,
+         * so BPF programs see either the old or the new value. Nothing else is touched. */
+        *(volatile uint32_t *) p = 0;
+
+        /* munmap failure here is harmless: the clear above already landed in
+         * the kernel, and the mapping is discarded by exec anyway. */
+        if (munmap(p, page_size()) < 0)
+                log_warning_errno(errno, "bpf-restrict-fsaccess: Failed to munmap .bss map, ignoring: %m");
+
+        log_info("bpf-restrict-fsaccess: Cleared initramfs trust window after switch_root.");
+        return 0;
+}
+
+int bpf_restrict_fsaccess_setup(Manager *m) { /* returns 0 on success or when the policy is not requested, negative errno otherwise */
+        _cleanup_(restrict_fsaccess_bpf_freep) struct restrict_fsaccess_bpf *obj = NULL;
+        int r;
+
+        assert(m);
+
+        if (!MANAGER_IS_SYSTEM(m) || m->restrict_filesystem_access <= RESTRICT_FILESYSTEM_ACCESS_NO)
+                return 0;
+
+        /* Fresh setup: verify BPF LSM is available */
+        if (!bpf_restrict_fsaccess_supported())
+                return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                         "bpf-restrict-fsaccess: BPF LSM is not available.");
+
+        /* Require dm-verity signature enforcement */
+        if (!dm_verity_require_signatures())
+                return log_error_errno(SYNTHETIC_ERRNO(ENOKEY),
+                                       "bpf-restrict-fsaccess: dm-verity require_signatures is not enabled. "
+                                       "RestrictFileSystemAccess= requires the kernel to enforce dm-verity signatures. "
+                                       "Set dm_verity.require_signatures=1 on the kernel command line.");
+
+        r = prepare_restrict_fsaccess_bpf(&obj);
+        if (r < 0)
+                return r;
+
+        /* If we're still in the initramfs, allow execution from it by recording
+         * the s_dev of the filesystem backing /usr/ (see get_root_s_dev()). Outside
+         * the initrd in_initrd() returns false and initramfs_s_dev stays at 0 (its
+         * default), i.e. the trust window is closed. */
+        if (in_initrd()) {
+                uint32_t root_dev;
+
+                r = get_root_s_dev(&root_dev);
+                if (r < 0)
+                        return r;
+
+                obj->bss->initramfs_s_dev = root_dev; /* written before attach, so the hooks never run without it */
+                log_info("bpf-restrict-fsaccess: Initramfs trusted (s_dev=%" PRIu32 ":%" PRIu32 ")",
+                         root_dev >> 20, root_dev & 0xFFFFF); /* kernel encoding: major above a 20-bit minor */
+        }
+
+        r = restrict_fsaccess_bpf__attach(obj);
+        if (r < 0)
+                return log_error_errno(r, "bpf-restrict-fsaccess: Failed to attach BPF programs: %m");
+
+        log_info("bpf-restrict-fsaccess: LSM BPF programs attached");
+
+        /* Extract owned FDs from the skeleton. These keep the kernel BPF objects
+         * alive after the skeleton is destroyed. Destroying the skeleton unmaps
+         * the .bss page from our address space so no BPF state is reachable via
+         * /proc/1/mem. */
+        struct bpf_link *links[] = RESTRICT_FSACCESS_LINKS(obj);
+
+        FOREACH_ELEMENT(link, links) {
+                size_t idx = link - links; /* position in links[] equals the link enum value */
+
+                m->restrict_fsaccess_link_fds[idx] = fcntl(sym_bpf_link__fd(*link), F_DUPFD_CLOEXEC, 3); /* close-on-exec duplicate, numbered 3 or higher */
+                if (m->restrict_fsaccess_link_fds[idx] < 0) {
+                        r = log_error_errno(errno, "bpf-restrict-fsaccess: Failed to dup link FD for %s: %m",
+                                            restrict_fsaccess_link_names[idx]);
+                        goto fail;
+                }
+        }
+
+        m->restrict_fsaccess_bss_map_fd = fcntl(sym_bpf_map__fd(obj->maps.bss), F_DUPFD_CLOEXEC, 3);
+        if (m->restrict_fsaccess_bss_map_fd < 0) {
+                r = log_error_errno(errno, "bpf-restrict-fsaccess: Failed to dup .bss map FD: %m");
+                goto fail;
+        }
+
+        return 0; /* obj is destroyed by _cleanup_ here; the duplicated FDs are what remains */
+
+fail:
+        /* Close partial FDs so we don't leave a half-baked policy attached
+         * once the skeleton is destroyed by _cleanup_. */
+        FOREACH_ELEMENT(fd, m->restrict_fsaccess_link_fds)
+                *fd = safe_close(*fd);
+        m->restrict_fsaccess_bss_map_fd = safe_close(m->restrict_fsaccess_bss_map_fd);
+        return r;
+}
+
+int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m) {
+        int bss_fd;
+
+        assert(m);
+        bss_fd = m->restrict_fsaccess_bss_map_fd;
+
+        /* Nothing to revoke outside the initrd, or (checked below) when no .bss map FD is held. */
+        if (!in_initrd())
+                return 0;
+        /* This runs BEFORE switch_root unmounts the initramfs: while that superblock is mounted its
+         * anonymous dev_t cannot be recycled by another filesystem, so the stale-dev_t window never
+         * opens. Failures are returned to the caller rather than ignored — fail closed. */
+        return bss_fd < 0 ? 0 : restrict_fsaccess_clear_initramfs_trust(bss_fd);
+}
+
+#else /* ! BPF_FRAMEWORK || ! HAVE_LSM_INTEGRITY_TYPE */
+
+bool bpf_restrict_fsaccess_supported(void) { /* stub for builds without the BPF program */
+        return false;
+}
+
+int bpf_restrict_fsaccess_setup(Manager *m) {
+        /* Stub: a no-op unless the policy was requested. Built when either BPF_FRAMEWORK or HAVE_LSM_INTEGRITY_TYPE is unset. */
+        if (!MANAGER_IS_SYSTEM(m) || m->restrict_filesystem_access <= RESTRICT_FILESYSTEM_ACCESS_NO)
+                return 0;
+        return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                 "bpf-restrict-fsaccess: RestrictFileSystemAccess= requested but support is not compiled in (needs the BPF framework and a vmlinux.h providing enum lsm_integrity_type).");
+}
+
+int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m) { /* stub: no policy can be attached in this build, so there is nothing to revoke */
+        return 0;
+}
+
+#endif
diff --git a/src/core/bpf-restrict-fsaccess.h b/src/core/bpf-restrict-fsaccess.h

new file mode 100644 (file)

index 0000000..7abbb7d
--- /dev/null
+++ b/src/core/bpf-restrict-fsaccess.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/sysmacros.h>
+
+#include "core-forward.h"
+#include "macro.h"
+#include "shared-forward.h"
+
+typedef enum RestrictFileSystemAccess {
+        RESTRICT_FILESYSTEM_ACCESS_NO,   /* policy disabled; spelled "no" */
+        RESTRICT_FILESYSTEM_ACCESS_EXEC, /* execution enforcement enabled; spelled "exec" */
+        _RESTRICT_FILESYSTEM_ACCESS_MAX, /* number of modes; sizes the string table */
+        _RESTRICT_FILESYSTEM_ACCESS_INVALID = -EINVAL,
+} RestrictFileSystemAccess;
+
+const char* restrict_filesystem_access_to_string(RestrictFileSystemAccess i) _const_;
+RestrictFileSystemAccess restrict_filesystem_access_from_string(const char *s) _pure_;
+
+enum { /* index of each attached BPF program's link in the per-link arrays */
+        RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_SETINTEGRITY, /* restrict_fsaccess_bdev_setintegrity */
+        RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_FREE,         /* restrict_fsaccess_bdev_free */
+        RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK,        /* restrict_fsaccess_bprm_check */
+        RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE,         /* restrict_fsaccess_mmap_file */
+        RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT,     /* restrict_fsaccess_file_mprotect */
+        _RESTRICT_FILESYSTEM_ACCESS_LINK_MAX,              /* number of links */
+};
+
+/* Maximum number of dm-verity devices tracked in the BPF hash map. */
+#define DMVERITY_DEVICES_MAX (16U*1024U)
+
+/* Translate a dev_t as returned by stat() into the in-kernel MKDEV() encoding.
+ * stat() hands out new_encode_dev(s_dev), whereas the BPF program reads s_dev
+ * itself, which is laid out as MKDEV(major, minor) = (major << 20) | minor. */
+#define STAT_DEV_TO_KERNEL(dev)                                         \
+        ((((uint32_t) major(dev)) << 20) | ((uint32_t) minor(dev)))
+
+/* Mirrors the BPF program's .bss section layout. PID1 clears initramfs_s_dev with a
+ * 32-bit store through an mmap() of the .bss map FD, so that field must stay at offset 0. */
+struct restrict_fsaccess_bss {
+        uint32_t initramfs_s_dev; /* kernel dev_t encoding: (major << 20) | minor */
+};
+
+extern const char* const restrict_fsaccess_link_names[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX];
+
+bool bpf_restrict_fsaccess_supported(void);
+int bpf_restrict_fsaccess_setup(Manager *m);
+
+int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m);
diff --git a/src/core/core-forward.h b/src/core/core-forward.h

index 446cf501e43e775c28c5e92cf36f11567c4c637e..14bcc142a2e0c45bac4cc18cf7a98ac04d13efa0 100644 (file)
--- a/src/core/core-forward.h
+++ b/src/core/core-forward.h
@@ -52,3 +52,4 @@ typedef struct Unit Unit;
  typedef struct UnitRef UnitRef;
  
  struct restrict_fs_bpf;
+struct restrict_fsaccess_bpf;
diff --git a/src/core/main.c b/src/core/main.c

index 3bdce441a85bbc250930f54604badf1152858ddd..c10df7d87a4fca65bc6659daa91ea81e17174e12 100644 (file)
--- a/src/core/main.c
+++ b/src/core/main.c
@@ -22,6 +22,7 @@
  #include "apparmor-setup.h"
  #include "architecture.h"
  #include "argv-util.h"
+#include "bpf-restrict-fsaccess.h"
  #include "build.h"
  #include "bus-error.h"
  #include "capability-util.h"
@@ -150,6 +151,7 @@ static char **arg_manager_environment;
  static uint64_t arg_capability_bounding_set;
  static bool arg_no_new_privs;
  static int arg_protect_system;
+static RestrictFileSystemAccess arg_restrict_filesystem_access;
  static nsec_t arg_timer_slack_nsec;
  static Set* arg_syscall_archs;
  static FILE* arg_serialization;
@@ -566,6 +568,17 @@ static int parse_proc_cmdline_item(const char *key, const char *value, void *dat
                          return 0;
                  }
  
+        } else if (proc_cmdline_key_streq(key, "systemd.restrict_filesystem_access")) {
+
+                if (value) {
+                        r = restrict_filesystem_access_from_string(value);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to parse systemd.restrict_filesystem_access= argument '%s', ignoring: %m", value);
+                        else
+                                arg_restrict_filesystem_access = r;
+                } else
+                        arg_restrict_filesystem_access = RESTRICT_FILESYSTEM_ACCESS_EXEC;
+
          } else if (streq(key, "quiet") && !value) {
  
                  if (arg_show_status == _SHOW_STATUS_INVALID)
@@ -717,6 +730,29 @@ static int config_parse_protect_system_pid1(
          return 0;
  }
  
+static int config_parse_restrict_filesystem_access(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Parses RestrictFileSystemAccess= into *data; the previous value is kept when rvalue does not parse. */
+        RestrictFileSystemAccess *target = ASSERT_PTR(data);
+        RestrictFileSystemAccess parsed = restrict_filesystem_access_from_string(rvalue);
+
+        if (parsed >= 0) {
+                *target = parsed;
+                return 0;
+        }
+        return log_syntax_parse_error(unit, filename, line, parsed, lvalue, rvalue);
+}
+
  static int config_parse_crash_reboot(
                  const char *unit,
                  const char *filename,
@@ -774,6 +810,7 @@ static int parse_config_file(void) {
                  { "Manager", "CapabilityBoundingSet",             config_parse_capability_set,        0,                        &arg_capability_bounding_set                           },
                  { "Manager", "NoNewPrivileges",                   config_parse_bool,                  0,                        &arg_no_new_privs                                      },
                  { "Manager", "ProtectSystem",                     config_parse_protect_system_pid1,   0,                        &arg_protect_system                                    },
+                { "Manager", "RestrictFileSystemAccess",         config_parse_restrict_filesystem_access, 0,                   &arg_restrict_filesystem_access                         },
  #if HAVE_SECCOMP
                  { "Manager", "SystemCallArchitectures",           config_parse_syscall_archs,         0,                        &arg_syscall_archs                                     },
  #else
@@ -925,6 +962,7 @@ static void set_manager_settings(Manager *m) {
  
          manager_set_show_status(m, arg_show_status, "command line");
          m->status_unit_format = arg_status_unit_format;
+        m->restrict_filesystem_access = arg_restrict_filesystem_access;
  }
  
  static int parse_argv(int argc, char *argv[]) {
@@ -1247,6 +1285,16 @@ static int prepare_reexecute(
          m->n_reloading++;
          bus_manager_send_reloading(m, true);
  
+        /* Only close the initramfs trust window when actually switching root.
+         * During a plain daemon-reexec in the initrd, PID1 still needs to
+         * execv() itself from the initramfs — clearing trust here would cause
+         * the BPF bprm_check_security hook to deny the exec. */
+        if (switching_root) {
+                r = bpf_restrict_fsaccess_close_initramfs_trust(m);
+                if (r < 0)
+                        return r;
+        }
+
          r = manager_open_serialization(m, &f);
          if (r < 0)
                  return log_error_errno(r, "Failed to create serialization file: %m");
@@ -2834,6 +2882,7 @@ static void reset_arguments(void) {
          arg_capability_bounding_set = CAP_MASK_ALL;
          arg_no_new_privs = false;
          arg_protect_system = -1;
+        arg_restrict_filesystem_access = RESTRICT_FILESYSTEM_ACCESS_NO;
          arg_timer_slack_nsec = NSEC_INFINITY;
  
          arg_syscall_archs = set_free(arg_syscall_archs);
diff --git a/src/core/manager.c b/src/core/manager.c

index da4e9ca408127ce172811b49990d9849455a822d..0e33273c3dc2464f6ccc623c8057c7fb29a96029 100644 (file)
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -21,6 +21,7 @@
  #include "audit-fd.h"
  #include "boot-timestamps.h"
  #include "bpf-restrict-fs.h"
+#include "bpf-restrict-fsaccess.h"
  #include "build-path.h"
  #include "bus-common-errors.h"
  #include "bus-error.h"
@@ -941,8 +942,13 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
                  .dump_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_MINUTE, .burst = 10 },
  
                  .executor_fd = -EBADF,
+
+                .restrict_fsaccess_bss_map_fd = -EBADF,
          };
  
+        FOREACH_ELEMENT(fd, m->restrict_fsaccess_link_fds)
+                *fd = -EBADF;
+
          unit_defaults_init(&m->defaults, runtime_scope);
  
  #if ENABLE_EFI
@@ -1784,6 +1790,8 @@ Manager* manager_free(Manager *m) {
  #if BPF_FRAMEWORK
          bpf_restrict_fs_destroy(m->restrict_fs);
  #endif
+        close_many(m->restrict_fsaccess_link_fds, ELEMENTSOF(m->restrict_fsaccess_link_fds));
+        safe_close(m->restrict_fsaccess_bss_map_fd);
  
          safe_close(m->executor_fd);
          free(m->executor_path);
@@ -2140,6 +2148,13 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
                          m->send_reloading_done = true;
          }
  
+        /* Set up RestrictFileSystemAccess= BPF LSM after deserialization (so we can detect deserialized link FDs)
+         * and before clearing switching_root (so we can close the initramfs trust window). This must
+         * run after set_manager_settings() has set m->restrict_filesystem_access. */
+        r = bpf_restrict_fsaccess_setup(m);
+        if (r < 0)
+                return r;
+
          manager_ready(m);
  
          manager_set_switching_root(m, false);
diff --git a/src/core/manager.h b/src/core/manager.h

index 9afc70b39a1d8f09ab57175e9ee6fdd80ede5547..fb65705d321df08d270ef4f8a2c064d3afabeb93 100644 (file)
--- a/src/core/manager.h
+++ b/src/core/manager.h
@@ -3,6 +3,7 @@
  
  #include "sd-event.h"
  
+#include "bpf-restrict-fsaccess.h"
  #include "cgroup.h"
  #include "common-signal.h"
  #include "execute.h"
@@ -479,6 +480,16 @@ typedef struct Manager {
          /* Reference to RestrictFileSystems= BPF program */
          struct restrict_fs_bpf *restrict_fs;
  
+        /* Configured RestrictFileSystemAccess= policy (the BPF LSM's FDs are tracked below) */
+        RestrictFileSystemAccess restrict_filesystem_access;
+
+        /* Raw BPF FDs extracted from the skeleton after attach. The kernel
+         * reference chain (link FD -> bpf_link -> bpf_prog -> bpf_map) keeps
+         * programs attached and map data alive. The .bss map FD is used for
+         * targeted writes (clearing initramfs_s_dev after switch_root). */
+        int restrict_fsaccess_link_fds[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX];
+        int restrict_fsaccess_bss_map_fd;
+
          /* Allow users to configure a rate limit for Reload()/Reexecute() operations */
          RateLimit reload_reexec_ratelimit;
          /* Dump*() are slow, so always rate limit them to 10 per 10 minutes */
diff --git a/src/core/meson.build b/src/core/meson.build

index eef53be94bc7557f0cf0a90c8450634a704ea59e..7f5845244e650840f841a4619caa8835a4f4596d 100644 (file)
--- a/src/core/meson.build
+++ b/src/core/meson.build
@@ -7,6 +7,7 @@ libcore_sources = files(
          'bpf-firewall.c',
          'bpf-foreign.c',
          'bpf-restrict-fs.c',
+        'bpf-restrict-fsaccess.c',
          'bpf-restrict-ifaces.c',
          'bpf-socket-bind.c',
          'bpf-bind-iface.c',
@@ -86,6 +87,10 @@ if conf.get('BPF_FRAMEWORK') == 1
          endforeach
  endif
  
+if conf.get('HAVE_LSM_INTEGRITY_TYPE') == 1
+        libcore_sources += bpf_programs_by_name['restrict-fsaccess']
+endif
+
  sources += libcore_sources
  
  load_fragment_gperf_gperf = custom_target(
diff --git a/src/core/system.conf.in b/src/core/system.conf.in

index 63d28059305fe2419edf8f86412bf642309a0ccb..35c7cec6efcbbcbf8aeb23a22947e5658f28b221 100644 (file)
--- a/src/core/system.conf.in
+++ b/src/core/system.conf.in
@@ -41,6 +41,7 @@
  #CapabilityBoundingSet=
  #NoNewPrivileges=no
  #ProtectSystem=auto
+#RestrictFileSystemAccess=no
  #SystemCallArchitectures=
  #TimerSlackNSec=
  #StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
diff --git a/src/shared/bpf-link.c b/src/shared/bpf-link.c

index 95f7256a56795865a2a7886aaf70a4846025cc61..80a6db47ded2393dccd384fc5a210ba3ef3fa4ed 100644 (file)
--- a/src/shared/bpf-link.c
+++ b/src/shared/bpf-link.c
@@ -19,6 +19,22 @@ bool bpf_can_link_program(struct bpf_program *prog) {
          return bpf_get_error_translated(link) == -EBADF;
  }
  
+bool bpf_can_link_lsm_program(struct bpf_program *prog) {
+        _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
+
+        assert(prog);
+
+        if (dlopen_bpf(LOG_DEBUG) < 0)
+                return false;
+
+        link = sym_bpf_program__attach_lsm(prog);
+
+        /* Probe by actually attaching the program: if bpf_program__attach_lsm() fails, the returned value
+         * encodes a libbpf error code instead of a valid pointer. That is the case on architectures where
+         * the BPF trampoline (and hence the BPF_LSM_MAC attach type) is not supported. */
+        return bpf_get_error_translated(link) == 0;
+}
+
  int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link) {
          assert(key);
  
diff --git a/src/shared/bpf-link.h b/src/shared/bpf-link.h

index 79da1c2fea26ce1b85acd131be00d47d48d37a84..4de95eb2e1ee3536bfb5395562b2d54a0e5a1d1b 100644 (file)
--- a/src/shared/bpf-link.h
+++ b/src/shared/bpf-link.h
@@ -8,6 +8,7 @@
  #include "shared-forward.h"
  
  bool bpf_can_link_program(struct bpf_program *prog);
+bool bpf_can_link_lsm_program(struct bpf_program *prog);
  
  int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link);
author	Christian Brauner <brauner@kernel.org>
	Fri, 8 May 2026 08:45:23 +0000 (10:45 +0200)
committer	Christian Brauner <brauner@kernel.org>
	Wed, 13 May 2026 08:36:12 +0000 (10:36 +0200)
man/kernel-command-line.xml		patch \| blob \| blame \| history
man/systemd-system.conf.xml		patch \| blob \| blame \| history
src/bpf/meson.build		patch \| blob \| blame \| history
src/bpf/restrict-fsaccess.bpf.c	[new file with mode: 0644]	patch \| blob
src/core/bpf-restrict-fs.c		patch \| blob \| blame \| history
src/core/bpf-restrict-fsaccess.c	[new file with mode: 0644]	patch \| blob
src/core/bpf-restrict-fsaccess.h	[new file with mode: 0644]	patch \| blob
src/core/core-forward.h		patch \| blob \| blame \| history
src/core/main.c		patch \| blob \| blame \| history
src/core/manager.c		patch \| blob \| blame \| history
src/core/manager.h		patch \| blob \| blame \| history
src/core/meson.build		patch \| blob \| blame \| history
src/core/system.conf.in		patch \| blob \| blame \| history
src/shared/bpf-link.c		patch \| blob \| blame \| history
src/shared/bpf-link.h		patch \| blob \| blame \| history