</listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>systemd.restrict_filesystem_access=</varname></term>
+ <listitem>
+ <para>Controls the <varname>RestrictFileSystemAccess=</varname> execution enforcement policy. For
+ details, see
+ <citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para>
+
+ <xi:include href="version-info.xml" xpointer="v261"/>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>systemd.mask=</varname></term>
<term><varname>systemd.wants=</varname></term>
<xi:include href="version-info.xml" xpointer="v256"/></listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>RestrictFileSystemAccess=</varname></term>
+
+ <listitem><para>Takes a boolean argument or the special value <literal>exec</literal>. Defaults to
+ <literal>no</literal>. When enabled, PID 1 loads a BPF LSM program that enforces a deny-default
+ execution policy: only binaries residing on signed dm-verity block devices (and the initramfs during
+ early boot) are permitted to execute. Execution from tmpfs, procfs, sysfs, unsigned dm-verity devices,
+ and anonymous executable memory mappings is denied.</para>
+
+ <para>This setting is intended as one component of an image-based, fully verified system, where the
+ whole boot chain (firmware, kernel image, kernel command line, initramfs) is measured and attested.
+ On a general-purpose system without such guarantees it does not provide a meaningful security
+ boundary on its own: an attacker with sufficient privilege to edit
+ <filename>system.conf</filename>, modify the kernel command line, or kexec into an unsigned initrd
+ can disable or bypass the policy.</para>
+
+ <para>The enforcement hooks block <function>execve()</function> of untrusted binaries
+ (<literal>bprm_check_security</literal>), <constant>PROT_EXEC</constant> memory mappings including
+ shared libraries (<literal>mmap_file</literal>), and write-to-execute transitions such as JIT
+ compilation (<literal>file_mprotect</literal>).</para>
+
+ <para>Note that execution from overlayfs mounts is blocked even if the underlying layers reside on
+ signed dm-verity devices, because the BPF program sees the overlay filesystem's anonymous device
+ number rather than the underlying block device. Multi-device filesystems such as btrfs are similarly
+ unsupported.</para>
+
+ <para>Note that, without further measures to secure the system, kexec (for example into an unsigned initrd) can be used to circumvent this policy.</para>
+
+ <para>This requires the kernel to be booted with <literal>dm_verity.require_signatures=1</literal>
+ on the kernel command line and with BPF LSM enabled (<literal>lsm=...,bpf</literal>). If either
+ prerequisite is not met, PID 1 will refuse to complete startup.</para>
+
+ <para>The value <literal>yes</literal> is equivalent to <literal>exec</literal>. Additional
+ modes may be added in the future.</para>
+
+ <para>This option may also be set via the <varname>systemd.restrict_filesystem_access=</varname> kernel command
+ line option, see
+ <citerefentry><refentrytitle>kernel-command-line</refentrytitle><manvolnum>7</manvolnum></citerefentry>.
+ </para>
+
+ <xi:include href="version-info.xml" xpointer="v261"/></listitem>
+ </varlistentry>
+
<varlistentry>
<term><varname>SystemCallArchitectures=</varname></term>
conf.set10('HAVE_VMLINUX_H', use_provided_vmlinux_h or use_generated_vmlinux_h)
+# 'enum lsm_integrity_type' was added together with the bdev_setintegrity LSM hook in kernel commit
+# 2deeb6c333e5 (v6.5). A generated vmlinux.h reflects the running kernel's BTF and is taken to provide the
+# enum without a probe; a provided vmlinux.h can be older, so compile-test it for the enum.
+have_lsm_integrity_type = false
+if use_generated_vmlinux_h
+ have_lsm_integrity_type = true
+elif use_provided_vmlinux_h
+ have_lsm_integrity_type = cc.compiles(
+ '#include "@0@"\nenum lsm_integrity_type _t;\n'.format(provided_vmlinux_h_path),
+ name : 'enum lsm_integrity_type in vmlinux.h')
+endif
+conf.set10('HAVE_LSM_INTEGRITY_TYPE', have_lsm_integrity_type)
+
conf.set10('ENABLE_SYSCTL_BPF', conf.get('HAVE_VMLINUX_H') == 1 and libbpf.version().version_compare('>= 0.7'))
bpf_programs = [
'source' : files('bind-iface.bpf.c'),
'condition' : 'BPF_FRAMEWORK',
},
+ {
+ 'source' : files('restrict-fsaccess.bpf.c'),
+ 'condition' : 'HAVE_LSM_INTEGRITY_TYPE',
+ 'depends' : vmlinux_h_dependency,
+ },
{
'source' : files('restrict-fs.bpf.c'),
'condition' : 'BPF_FRAMEWORK',
--- /dev/null
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* The SPDX header above is actually correct in claiming this was
+ * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
+ * compatible with GPL we will claim this to be GPL however, which should be
+ * fine given that LGPL-2.1-or-later downgrades to GPL if needed.
+ */
+
+/* Trusted Execution BPF LSM program.
+ *
+ * Enforces that only binaries from signed dm-verity block devices (or the
+ * initramfs during early boot) can be executed.
+ *
+ * Architecture:
+ * - bdev_setintegrity hook: self-populates a map of trusted devices when
+ * dm-verity signals signature validity
+ * - bdev_free_security hook: removes devices from the map on teardown
+ * - bprm_check_security: blocks execve() from untrusted sources
+ * - mmap_file: blocks PROT_EXEC mmap from untrusted sources
+ * - file_mprotect: blocks W->X transitions from untrusted sources
+ */
+
+/* If offsetof() is implemented via __builtin_offsetof() then it doesn't work on current compilers, since the
+ * built-ins do not understand CO-RE. Let's undefine any such macros here, to force bpf_helpers.h to provide
+ * its own definitions for this. (In new versions it will do so automatically, but at least in libbpf 1.1.0
+ * it does not.) */
+#undef offsetof
+#undef container_of
+
+#include "vmlinux.h"
+
+#include <errno.h> /* IWYU pragma: keep */
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define PROT_EXEC 0x4 /* protection bit tested by the mmap_file and file_mprotect hooks */
+#define VM_EXEC 0x00000004 /* vm_flags bit tested by the file_mprotect hook */
+
+/* ---- Maps ----
+ *
+ * verity_devices: hash map keyed by kernel dev_t that records, per block device, whether dm-verity
+ * reported a valid signature. It is written by the bdev_setintegrity/bdev_free_security hooks and
+ * read by check_trusted_file(). */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 0); /* placeholder; userspace sets the real size before loading the object */
+ __type(key, __u32); /* dev_t from bdev->bd_dev */
+ __type(value, __u8); /* 1 = signature valid, 0 = recorded but not valid */
+} verity_devices SEC(".maps");
+
+/* ---- Globals (set by PID1 via skeleton) ---- */
+
+/* Device number (kernel dev_t encoding) of the filesystem trusted during early boot. PID1 sets this
+ * at load time while it runs in the initrd and clears it (to 0) when it prepares to switch root,
+ * i.e. before the initramfs is unmounted. A value of 0 means "no initramfs trust
+ * — the window is closed." */
+volatile __u32 initramfs_s_dev;
+
+/* ---- Integrity tracking hooks ----
+ *
+ * restrict_fsaccess_bdev_setintegrity: attached to the bdev_setintegrity LSM hook. Only the
+ * LSM_INT_DMVERITY_SIG_VALID property is acted upon: the device's dev_t is stored in verity_devices
+ * with value 1 if a non-empty value was supplied and 0 otherwise. Always returns 0. */
+
+SEC("lsm/bdev_setintegrity")
+int BPF_PROG(restrict_fsaccess_bdev_setintegrity, struct block_device *bdev,
+ enum lsm_integrity_type type, const void *value, __u64 size)
+{
+ if (type == LSM_INT_DMVERITY_SIG_VALID) {
+ __u32 dev = bdev->bd_dev;
+ __u8 valid = value && size > 0;
+ bpf_map_update_elem(&verity_devices, &dev, &valid, BPF_ANY); /* result not checked */
+ }
+
+ return 0;
+}
+
+SEC("lsm/bdev_free_security")
+void BPF_PROG(restrict_fsaccess_bdev_free, struct block_device *bdev)
+{
+ __u32 dev = bdev->bd_dev; /* same key the bdev_setintegrity hook stores under */
+ bpf_map_delete_elem(&verity_devices, &dev); /* result not checked; the device may never have been recorded */
+}
+
+/* ---- Enforcement helpers ---- */
+
+/* Check whether a file is from a trusted source.
+ * Returns 0 (allow) or -EPERM (deny).
+ *
+ * The decision is based solely on the s_dev of the superblock the file's inode belongs to: it is
+ * allowed if it equals a non-zero initramfs_s_dev, or if verity_devices holds a non-zero entry for
+ * it. Everything else is denied. */
+static __always_inline int check_trusted_file(struct file *file)
+{
+ __u32 s_dev;
+ __u8 *sig_valid;
+
+ BPF_CORE_READ_INTO(&s_dev, file, f_inode, i_sb, s_dev);
+
+ /* Check initramfs trust (active only while initramfs_s_dev is non-zero, i.e. during early boot) */
+ if (initramfs_s_dev != 0 && s_dev == initramfs_s_dev)
+ return 0;
+
+ /* Check verity device map; an entry with value 0 is treated like a missing entry */
+ sig_valid = bpf_map_lookup_elem(&verity_devices, &s_dev);
+ if (sig_valid && *sig_valid)
+ return 0;
+
+ return -EPERM;
+}
+
+/* ---- Enforcement hooks ----
+ *
+ * Each hook returns 0 to allow the operation or -EPERM to deny it.
+ *
+ * restrict_fsaccess_bprm_check: attached to bprm_check_security; applies check_trusted_file() to
+ * bprm->file. */
+
+SEC("lsm/bprm_check_security")
+int BPF_PROG(restrict_fsaccess_bprm_check, struct linux_binprm *bprm)
+{
+ struct file *file;
+
+ BPF_CORE_READ_INTO(&file, bprm, file);
+ return check_trusted_file(file);
+}
+
+SEC("lsm/mmap_file")
+int BPF_PROG(restrict_fsaccess_mmap_file, struct file *file, unsigned long reqprot,
+ unsigned long prot, unsigned long flags)
+{
+ /* Only enforce on executable mappings; reqprot and flags are not consulted */
+ if (!(prot & PROT_EXEC))
+ return 0;
+
+ /* Anonymous executable mapping — no file backing, deny */
+ if (!file)
+ return -EPERM;
+
+ /* File-backed executable mapping: allowed only if the file comes from a trusted source */
+ return check_trusted_file(file);
+}
+
+SEC("lsm/file_mprotect")
+int BPF_PROG(restrict_fsaccess_file_mprotect, struct vm_area_struct *vma,
+ unsigned long reqprot, unsigned long prot)
+{
+ struct file *file;
+ unsigned long vm_flags;
+
+ /* Only enforce when the new protection includes PROT_EXEC; reqprot is not consulted */
+ if (!(prot & PROT_EXEC))
+ return 0;
+
+ /* If VM_EXEC is already set, the mapping is already executable — this
+ * mprotect isn't granting new executable capability, allow */
+ BPF_CORE_READ_INTO(&vm_flags, vma, vm_flags);
+ if (vm_flags & VM_EXEC)
+ return 0;
+
+ /* Anonymous executable mapping — no file backing, deny */
+ BPF_CORE_READ_INTO(&file, vma, vm_file);
+ if (!file)
+ return -EPERM;
+
+ /* File-backed mapping gaining PROT_EXEC: allowed only if the file comes from a trusted source */
+ return check_trusted_file(file);
+}
+
+static const char _license[] SEC("license") = "GPL"; /* declared as GPL for the kernel; see the licensing note at the top of this file */
DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free);
-static bool bpf_can_link_lsm_program(struct bpf_program *prog) {
- _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
-
- assert(prog);
-
- link = sym_bpf_program__attach_lsm(prog);
-
- /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory
- * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence
- * BPF_LSM_MAC attach type) is not supported. */
- return bpf_get_error_translated(link) == 0;
-}
-
static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
_cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
_cleanup_close_ int inner_map_fd = -EBADF;
--- /dev/null
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include "bpf-restrict-fsaccess.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "lsm-util.h"
+#include "manager.h"
+#include "memory-util.h"
+#include "string-table.h"
+
+/* DMVERITY_DEVICES_MAX lives in bpf-restrict-fsaccess.h for sharing with tests.
+ *
+ * The table below holds the string names of the RestrictFileSystemAccess= modes. The lookup
+ * functions generated from it also accept boolean strings, with RESTRICT_FILESYSTEM_ACCESS_EXEC
+ * passed as the value a true boolean maps to (how false is mapped is up to the macro and not
+ * visible here). */
+
+static const char* const restrict_filesystem_access_table[_RESTRICT_FILESYSTEM_ACCESS_MAX] = {
+ [RESTRICT_FILESYSTEM_ACCESS_NO] = "no",
+ [RESTRICT_FILESYSTEM_ACCESS_EXEC] = "exec",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(restrict_filesystem_access, RestrictFileSystemAccess, RESTRICT_FILESYSTEM_ACCESS_EXEC);
+
+const char* const restrict_fsaccess_link_names[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX] = { /* one name per RESTRICT_FILESYSTEM_ACCESS_LINK_* index; used when logging link FD failures */
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_SETINTEGRITY] = "restrict-fsaccess-bdev-setintegrity-link",
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_FREE] = "restrict-fsaccess-bdev-free-link",
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK] = "restrict-fsaccess-bprm-check-link",
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE] = "restrict-fsaccess-mmap-file-link",
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT] = "restrict-fsaccess-file-mprotect-link",
+};
+
+#if BPF_FRAMEWORK && HAVE_LSM_INTEGRITY_TYPE
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "restrict-fsaccess-skel.h"
+
+static struct restrict_fsaccess_bpf *restrict_fsaccess_bpf_free(struct restrict_fsaccess_bpf *obj) {
+ restrict_fsaccess_bpf__destroy(obj); /* skeleton-generated destructor */
+ return NULL; /* always NULL, whatever was passed in */
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fsaccess_bpf *, restrict_fsaccess_bpf_free); /* provides restrict_fsaccess_bpf_freep for _cleanup_() */
+
+/* Verify that struct restrict_fsaccess_bss has the same size as the skeleton's .bss struct. Only
+ * the total size is compared here, not the individual field offsets. */
+assert_cc(sizeof(struct restrict_fsaccess_bss) == sizeof_field(struct restrict_fsaccess_bpf, bss[0]));
+
+/* Build the skeleton links array indexed by the link enum. Expands to a brace-enclosed initializer
+ * for an array of struct bpf_link pointers; note that (obj) is evaluated once per entry. */
+#define RESTRICT_FSACCESS_LINKS(obj) { \
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_SETINTEGRITY] = (obj)->links.restrict_fsaccess_bdev_setintegrity, \
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_FREE] = (obj)->links.restrict_fsaccess_bdev_free, \
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK] = (obj)->links.restrict_fsaccess_bprm_check, \
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE] = (obj)->links.restrict_fsaccess_mmap_file, \
+ [RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT] = (obj)->links.restrict_fsaccess_file_mprotect, \
+}
+
+/* Report whether the dm-verity module parameter require_signatures reads as true.
+ * Any failure to read the parameter counts as "not enabled"; failures other than -ENOENT are
+ * additionally logged as a warning. */
+static bool dm_verity_require_signatures(void) {
+        int enabled = read_boolean_file("/sys/module/dm_verity/parameters/require_signatures");
+
+        if (enabled >= 0)
+                return enabled > 0;
+
+        /* An absent parameter file is handled silently, everything else is worth a warning. */
+        if (enabled != -ENOENT)
+                log_warning_errno(enabled, "bpf-restrict-fsaccess: Failed to read dm-verity require_signatures: %m");
+
+        return false;
+}
+
+/* Determine the device number, in kernel dev_t encoding, of the filesystem that holds /usr/.
+ * On success stores it in *ret and returns 0; on failure logs and returns the value produced by
+ * log_error_errno() for the stat() error. */
+static int get_root_s_dev(uint32_t *ret) {
+        struct stat usr_st;
+
+        assert(ret);
+
+        /* /usr/ is used instead of / because executable code lives in /usr/ and the design pushes
+         * toward a writable, non-executable /. With a separate /usr partition this deliberately
+         * leaves / untrusted. */
+        if (stat("/usr/", &usr_st) >= 0) {
+                *ret = STAT_DEV_TO_KERNEL(usr_st.st_dev);
+                return 0;
+        }
+
+        return log_error_errno(errno, "bpf-restrict-fsaccess: Failed to stat /usr/ filesystem: %m");
+}
+
+/* Open the BPF skeleton, size the verity_devices map to DMVERITY_DEVICES_MAX and load the object
+ * into the kernel. Nothing is attached here.
+ *
+ * On success hands ownership of the loaded skeleton to *ret and returns 0. On failure logs,
+ * returns a negative value and leaves *ret untouched; a partially set up skeleton is released by
+ * the cleanup handler. */
+static int prepare_restrict_fsaccess_bpf(struct restrict_fsaccess_bpf **ret) {
+        _cleanup_(restrict_fsaccess_bpf_freep) struct restrict_fsaccess_bpf *skel = NULL;
+        int r;
+
+        assert(ret);
+
+        skel = restrict_fsaccess_bpf__open();
+        if (!skel)
+                return log_error_errno(errno, "bpf-restrict-fsaccess: Failed to open BPF object: %m");
+
+        /* The map is declared with a placeholder size, so it has to be sized before loading. */
+        r = sym_bpf_map__set_max_entries(skel->maps.verity_devices, DMVERITY_DEVICES_MAX);
+        if (r < 0)
+                return log_error_errno(r, "bpf-restrict-fsaccess: Failed to size hash table: %m");
+
+        r = restrict_fsaccess_bpf__load(skel);
+        if (r < 0)
+                return log_error_errno(r, "bpf-restrict-fsaccess: Failed to load BPF object: %m");
+
+        *ret = TAKE_PTR(skel);
+        return 0;
+}
+
+/* Run the actual availability probe: libbpf must be loadable, the "bpf" LSM must be reported as
+ * active, the BPF object must load, and a test attachment of the bprm_check program must succeed.
+ * The skeleton used for probing is released again before returning. */
+static bool restrict_fsaccess_probe(void) {
+        _cleanup_(restrict_fsaccess_bpf_freep) struct restrict_fsaccess_bpf *skel = NULL;
+        int r;
+
+        if (dlopen_bpf(LOG_WARNING) < 0)
+                return false;
+
+        r = lsm_supported("bpf");
+        if (r == -ENOPKG) {
+                log_debug_errno(r, "bpf-restrict-fsaccess: securityfs not mounted, BPF LSM not available.");
+                return false;
+        }
+        if (r < 0) {
+                log_warning_errno(r, "bpf-restrict-fsaccess: Can't determine whether the BPF LSM module is used: %m");
+                return false;
+        }
+        if (r == 0) {
+                log_info("bpf-restrict-fsaccess: BPF LSM hook not enabled in the kernel, not supported.");
+                return false;
+        }
+
+        if (prepare_restrict_fsaccess_bpf(&skel) < 0)
+                return false;
+
+        if (!bpf_can_link_lsm_program(skel->progs.restrict_fsaccess_bprm_check)) {
+                log_warning("bpf-restrict-fsaccess: Failed to link program; assuming BPF LSM is not available.");
+                return false;
+        }
+
+        return true;
+}
+
+/* Tell whether RestrictFileSystemAccess= can be enforced on this system. The probe runs at most
+ * once per process; its outcome (positive or negative) is cached for all later calls. */
+bool bpf_restrict_fsaccess_supported(void) {
+        static int supported = -1; /* -1: not probed yet, 0: unsupported, 1: supported */
+
+        if (supported < 0)
+                supported = restrict_fsaccess_probe();
+
+        return supported;
+}
+
+/* Close the initramfs trust window by clearing initramfs_s_dev in the BPF .bss map. The caller
+ * invokes this while preparing to switch root, i.e. before the initramfs is unmounted.
+ *
+ * The .bss is a BPF_F_MMAPABLE array map — mmap it and do a single aligned 4-byte store instead
+ * of rewriting the whole map value via bpf_map_update_elem.
+ *
+ * bss_map_fd: FD of the .bss map; must be valid.
+ * Returns 0 on success. If the map cannot be mapped, logs and returns the value produced by
+ * log_error_errno() for the mmap() error. A failing munmap() is logged but not treated as error. */
+static int restrict_fsaccess_clear_initramfs_trust(int bss_map_fd) {
+        void *p;
+
+        assert(bss_map_fd >= 0);
+        assert_cc(offsetof(struct restrict_fsaccess_bss, initramfs_s_dev) == 0);
+
+        p = mmap(NULL, page_size(), PROT_READ | PROT_WRITE, MAP_SHARED, bss_map_fd, 0);
+        if (p == MAP_FAILED)
+                return log_error_errno(errno, "bpf-restrict-fsaccess: Failed to mmap .bss map: %m");
+
+        /* initramfs_s_dev is at offset 0 in the .bss layout (asserted above). A single aligned
+         * 32-bit store means BPF programs see either the old or the new value, never a torn one.
+         * The volatile access keeps the compiler from splitting or dropping the store. */
+        *(volatile uint32_t *) p = 0;
+
+        /* munmap failure here is harmless: the clear above already landed in
+         * the kernel, and the mapping is discarded by exec anyway. */
+        if (munmap(p, page_size()) < 0)
+                log_warning_errno(errno, "bpf-restrict-fsaccess: Failed to munmap .bss map, ignoring: %m");
+
+        log_info("bpf-restrict-fsaccess: Cleared initramfs trust window before switch_root.");
+        return 0;
+}
+
+/* Load and attach the RestrictFileSystemAccess= BPF LSM programs on behalf of the system manager.
+ *
+ * Returns 0 if the policy is not requested (non-system manager, or mode "no") or if it was set up
+ * successfully; otherwise logs and returns a negative value. On success the Manager holds its own
+ * duplicated FDs for every BPF link and for the .bss map; on failure none of those FDs is left
+ * open. */
+int bpf_restrict_fsaccess_setup(Manager *m) {
+        _cleanup_(restrict_fsaccess_bpf_freep) struct restrict_fsaccess_bpf *skel = NULL;
+        int r;
+
+        assert(m);
+
+        if (!MANAGER_IS_SYSTEM(m))
+                return 0;
+        if (m->restrict_filesystem_access <= RESTRICT_FILESYSTEM_ACCESS_NO)
+                return 0;
+
+        /* The policy was requested, so missing prerequisites are errors from here on. First make
+         * sure the BPF LSM is usable at all. */
+        if (!bpf_restrict_fsaccess_supported())
+                return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                         "bpf-restrict-fsaccess: BPF LSM is not available.");
+
+        /* The kernel itself has to insist on signed dm-verity devices. */
+        if (!dm_verity_require_signatures())
+                return log_error_errno(SYNTHETIC_ERRNO(ENOKEY),
+                                       "bpf-restrict-fsaccess: dm-verity require_signatures is not enabled. "
+                                       "RestrictFileSystemAccess= requires the kernel to enforce dm-verity signatures. "
+                                       "Set dm_verity.require_signatures=1 on the kernel command line.");
+
+        r = prepare_restrict_fsaccess_bpf(&skel);
+        if (r < 0)
+                return r;
+
+        /* While still in the initramfs, permit execution from it by recording its s_dev before the
+         * programs are attached. Once PID1 has switched root and re-executed, in_initrd() is false
+         * and initramfs_s_dev keeps its default of 0, i.e. the trust window stays closed. */
+        if (in_initrd()) {
+                uint32_t usr_dev;
+
+                r = get_root_s_dev(&usr_dev);
+                if (r < 0)
+                        return r;
+
+                skel->bss->initramfs_s_dev = usr_dev;
+                log_info("bpf-restrict-fsaccess: Initramfs trusted (s_dev=%" PRIu32 ":%" PRIu32 ")",
+                         usr_dev >> 20, usr_dev & 0xFFFFF);
+        }
+
+        r = restrict_fsaccess_bpf__attach(skel);
+        if (r < 0)
+                return log_error_errno(r, "bpf-restrict-fsaccess: Failed to attach BPF programs: %m");
+
+        log_info("bpf-restrict-fsaccess: LSM BPF programs attached");
+
+        /* Take our own FDs out of the skeleton. They keep the kernel BPF objects alive after the
+         * skeleton is destroyed, and destroying the skeleton unmaps the .bss page from our address
+         * space so that no BPF state is reachable via /proc/1/mem. */
+        struct bpf_link *links[] = RESTRICT_FSACCESS_LINKS(skel);
+
+        for (size_t i = 0; i < ELEMENTSOF(links); i++) {
+                int fd = fcntl(sym_bpf_link__fd(links[i]), F_DUPFD_CLOEXEC, 3);
+
+                m->restrict_fsaccess_link_fds[i] = fd;
+                if (fd < 0) {
+                        r = log_error_errno(errno, "bpf-restrict-fsaccess: Failed to dup link FD for %s: %m",
+                                            restrict_fsaccess_link_names[i]);
+                        goto fail;
+                }
+        }
+
+        m->restrict_fsaccess_bss_map_fd = fcntl(sym_bpf_map__fd(skel->maps.bss), F_DUPFD_CLOEXEC, 3);
+        if (m->restrict_fsaccess_bss_map_fd < 0) {
+                r = log_error_errno(errno, "bpf-restrict-fsaccess: Failed to dup .bss map FD: %m");
+                goto fail;
+        }
+
+        return 0;
+
+fail:
+        /* Drop whatever FDs were already duplicated, so that no half-baked policy stays attached
+         * once the cleanup handler destroys the skeleton. */
+        for (size_t i = 0; i < ELEMENTSOF(m->restrict_fsaccess_link_fds); i++)
+                m->restrict_fsaccess_link_fds[i] = safe_close(m->restrict_fsaccess_link_fds[i]);
+        m->restrict_fsaccess_bss_map_fd = safe_close(m->restrict_fsaccess_bss_map_fd);
+        return r;
+}
+
+/* Close the initramfs trust window ahead of switch_root.
+ *
+ * Returns 0 when there is nothing to do (not running in the initrd, or no .bss map FD is held),
+ * otherwise the result of restrict_fsaccess_clear_initramfs_trust(). */
+int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m) {
+        assert(m);
+
+        /* initramfs_s_dev has to be cleared BEFORE switch_root unmounts the initramfs. As long as
+         * the initramfs superblock is mounted it still holds its anonymous dev_t, so no other
+         * filesystem can be handed the same number. Anonymous dev_t recycling is immediate and
+         * lowest-first, which makes a stale initramfs_s_dev a near-certain trust bypass — hence
+         * the caller is expected to fail closed if this returns an error. */
+        if (in_initrd() && m->restrict_fsaccess_bss_map_fd >= 0)
+                return restrict_fsaccess_clear_initramfs_trust(m->restrict_fsaccess_bss_map_fd);
+
+        return 0;
+}
+
+#else /* ! BPF_FRAMEWORK || ! HAVE_LSM_INTEGRITY_TYPE */
+
+bool bpf_restrict_fsaccess_supported(void) {
+ return false; /* stub for builds without BPF_FRAMEWORK or HAVE_LSM_INTEGRITY_TYPE: never supported */
+}
+
+/* Stub used when the BPF LSM programs are not built, i.e. when either BPF_FRAMEWORK or
+ * HAVE_LSM_INTEGRITY_TYPE is off. Returns 0 if the policy is not requested (non-system manager,
+ * or mode "no"); otherwise logs a warning and returns the value log_warning_errno() produces for
+ * EOPNOTSUPP. */
+int bpf_restrict_fsaccess_setup(Manager *m) {
+        assert(m);
+
+        if (!MANAGER_IS_SYSTEM(m) || m->restrict_filesystem_access <= RESTRICT_FILESYSTEM_ACCESS_NO)
+                return 0;
+
+        /* Do not blame the BPF framework specifically: this branch is also compiled when the
+         * framework is available but vmlinux.h lacks enum lsm_integrity_type. */
+        return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                 "bpf-restrict-fsaccess: RestrictFileSystemAccess= requested but support for it is not compiled in.");
+}
+
+int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m) {
+ return 0; /* stub: without the BPF programs there is no trust window to close */
+}
+
+#endif
--- /dev/null
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/sysmacros.h>
+
+#include "core-forward.h"
+#include "macro.h"
+#include "shared-forward.h"
+
+typedef enum RestrictFileSystemAccess {
+ RESTRICT_FILESYSTEM_ACCESS_NO, /* "no": policy not enforced */
+ RESTRICT_FILESYSTEM_ACCESS_EXEC, /* "exec": execution enforcement policy */
+ _RESTRICT_FILESYSTEM_ACCESS_MAX, /* number of modes; sizes the string table */
+ _RESTRICT_FILESYSTEM_ACCESS_INVALID = -EINVAL,
+} RestrictFileSystemAccess;
+
+const char* restrict_filesystem_access_to_string(RestrictFileSystemAccess i) _const_;
+RestrictFileSystemAccess restrict_filesystem_access_from_string(const char *s) _pure_;
+
+enum { /* one index per attached BPF program; used for restrict_fsaccess_link_names[] and the Manager's link FD array */
+ RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_SETINTEGRITY,
+ RESTRICT_FILESYSTEM_ACCESS_LINK_BDEV_FREE,
+ RESTRICT_FILESYSTEM_ACCESS_LINK_BPRM_CHECK,
+ RESTRICT_FILESYSTEM_ACCESS_LINK_MMAP_FILE,
+ RESTRICT_FILESYSTEM_ACCESS_LINK_FILE_MPROTECT,
+ _RESTRICT_FILESYSTEM_ACCESS_LINK_MAX, /* number of links; sizes the arrays */
+};
+
+/* Maximum number of dm-verity devices tracked in the BPF hash map. */
+#define DMVERITY_DEVICES_MAX (16U*1024U)
+
+/* Convert userspace dev_t (from stat()) to kernel dev_t encoding (MKDEV).
+ * stat() returns new_encode_dev(s_dev); the BPF program reads s_dev directly
+ * which uses MKDEV(major, minor) = (major << 20) | minor.
+ * The minor is not masked here, i.e. it is assumed to fit into the low 20 bits. */
+#define STAT_DEV_TO_KERNEL(dev) \
+ ((uint32_t)major(dev) << 20 | (uint32_t)minor(dev))
+
+/* Mirrors the BPF program's .bss section layout. Userspace clears initramfs_s_dev through an
+ * mmap() of the .bss map FD and relies on the field being located at offset 0. */
+struct restrict_fsaccess_bss {
+ uint32_t initramfs_s_dev; /* kernel dev_t encoding: (major << 20) | minor */
+};
+
+extern const char* const restrict_fsaccess_link_names[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX];
+
+bool bpf_restrict_fsaccess_supported(void);
+int bpf_restrict_fsaccess_setup(Manager *m);
+
+int bpf_restrict_fsaccess_close_initramfs_trust(Manager *m);
typedef struct UnitRef UnitRef;
struct restrict_fs_bpf;
+struct restrict_fsaccess_bpf;
#include "apparmor-setup.h"
#include "architecture.h"
#include "argv-util.h"
+#include "bpf-restrict-fsaccess.h"
#include "build.h"
#include "bus-error.h"
#include "capability-util.h"
static uint64_t arg_capability_bounding_set;
static bool arg_no_new_privs;
static int arg_protect_system;
+static RestrictFileSystemAccess arg_restrict_filesystem_access;
static nsec_t arg_timer_slack_nsec;
static Set* arg_syscall_archs;
static FILE* arg_serialization;
return 0;
}
+ } else if (proc_cmdline_key_streq(key, "systemd.restrict_filesystem_access")) {
+
+ if (value) {
+ r = restrict_filesystem_access_from_string(value);
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse systemd.restrict_filesystem_access= argument '%s', ignoring: %m", value);
+ else
+ arg_restrict_filesystem_access = r;
+ } else
+ arg_restrict_filesystem_access = RESTRICT_FILESYSTEM_ACCESS_EXEC;
+
} else if (streq(key, "quiet") && !value) {
if (arg_show_status == _SHOW_STATUS_INVALID)
return 0;
}
+/* Config parser callback for RestrictFileSystemAccess=.
+ *
+ * data must point to a RestrictFileSystemAccess variable. It is overwritten only if rvalue parses
+ * via restrict_filesystem_access_from_string(); otherwise the failure is handed to
+ * log_syntax_parse_error() and its result is returned. Returns 0 after a successful assignment. */
+static int config_parse_restrict_filesystem_access(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        RestrictFileSystemAccess *target = ASSERT_PTR(data);
+        RestrictFileSystemAccess parsed = restrict_filesystem_access_from_string(rvalue);
+
+        if (parsed >= 0) {
+                *target = parsed;
+                return 0;
+        }
+
+        return log_syntax_parse_error(unit, filename, line, parsed, lvalue, rvalue);
+}
+
static int config_parse_crash_reboot(
const char *unit,
const char *filename,
{ "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
{ "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
{ "Manager", "ProtectSystem", config_parse_protect_system_pid1, 0, &arg_protect_system },
+ { "Manager", "RestrictFileSystemAccess", config_parse_restrict_filesystem_access, 0, &arg_restrict_filesystem_access },
#if HAVE_SECCOMP
{ "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
#else
manager_set_show_status(m, arg_show_status, "command line");
m->status_unit_format = arg_status_unit_format;
+ m->restrict_filesystem_access = arg_restrict_filesystem_access;
}
static int parse_argv(int argc, char *argv[]) {
m->n_reloading++;
bus_manager_send_reloading(m, true);
+ /* Only close the initramfs trust window when actually switching root.
+ * During a plain daemon-reexec in the initrd, PID1 still needs to
+ * execv() itself from the initramfs — clearing trust here would cause
+ * the BPF bprm_check_security hook to deny the exec. */
+ if (switching_root) {
+ r = bpf_restrict_fsaccess_close_initramfs_trust(m);
+ if (r < 0)
+ return r;
+ }
+
r = manager_open_serialization(m, &f);
if (r < 0)
return log_error_errno(r, "Failed to create serialization file: %m");
arg_capability_bounding_set = CAP_MASK_ALL;
arg_no_new_privs = false;
arg_protect_system = -1;
+ arg_restrict_filesystem_access = RESTRICT_FILESYSTEM_ACCESS_NO;
arg_timer_slack_nsec = NSEC_INFINITY;
arg_syscall_archs = set_free(arg_syscall_archs);
#include "audit-fd.h"
#include "boot-timestamps.h"
#include "bpf-restrict-fs.h"
+#include "bpf-restrict-fsaccess.h"
#include "build-path.h"
#include "bus-common-errors.h"
#include "bus-error.h"
.dump_ratelimit = (const RateLimit) { .interval = 10 * USEC_PER_MINUTE, .burst = 10 },
.executor_fd = -EBADF,
+
+ .restrict_fsaccess_bss_map_fd = -EBADF,
};
+ FOREACH_ELEMENT(fd, m->restrict_fsaccess_link_fds)
+ *fd = -EBADF;
+
unit_defaults_init(&m->defaults, runtime_scope);
#if ENABLE_EFI
#if BPF_FRAMEWORK
bpf_restrict_fs_destroy(m->restrict_fs);
#endif
+ close_many(m->restrict_fsaccess_link_fds, ELEMENTSOF(m->restrict_fsaccess_link_fds));
+ safe_close(m->restrict_fsaccess_bss_map_fd);
safe_close(m->executor_fd);
free(m->executor_path);
m->send_reloading_done = true;
}
+ /* Set up RestrictFileSystemAccess= BPF LSM after deserialization (so we can detect deserialized link FDs)
+ * and before clearing switching_root (so we can close the initramfs trust window). This must
+ * run after set_manager_settings() has set m->restrict_filesystem_access. */
+ r = bpf_restrict_fsaccess_setup(m);
+ if (r < 0)
+ return r;
+
manager_ready(m);
manager_set_switching_root(m, false);
#include "sd-event.h"
+#include "bpf-restrict-fsaccess.h"
#include "cgroup.h"
#include "common-signal.h"
#include "execute.h"
/* Reference to RestrictFileSystems= BPF program */
struct restrict_fs_bpf *restrict_fs;
+ /* Configured RestrictFileSystemAccess= mode */
+ RestrictFileSystemAccess restrict_filesystem_access;
+
+ /* Raw BPF FDs extracted from the skeleton after attach. The kernel
+ * reference chain (link FD -> bpf_link -> bpf_prog -> bpf_map) keeps
+ * programs attached and map data alive. The .bss map FD is used for
+ * targeted writes (clearing initramfs_s_dev when switching root). */
+ int restrict_fsaccess_link_fds[_RESTRICT_FILESYSTEM_ACCESS_LINK_MAX];
+ int restrict_fsaccess_bss_map_fd;
+
/* Allow users to configure a rate limit for Reload()/Reexecute() operations */
RateLimit reload_reexec_ratelimit;
/* Dump*() are slow, so always rate limit them to 10 per 10 minutes */
'bpf-firewall.c',
'bpf-foreign.c',
'bpf-restrict-fs.c',
+ 'bpf-restrict-fsaccess.c',
'bpf-restrict-ifaces.c',
'bpf-socket-bind.c',
'bpf-bind-iface.c',
endforeach
endif
+if conf.get('HAVE_LSM_INTEGRITY_TYPE') == 1
+ libcore_sources += bpf_programs_by_name['restrict-fsaccess']
+endif
+
sources += libcore_sources
load_fragment_gperf_gperf = custom_target(
#CapabilityBoundingSet=
#NoNewPrivileges=no
#ProtectSystem=auto
+#RestrictFileSystemAccess=no
#SystemCallArchitectures=
#TimerSlackNSec=
#StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}
return bpf_get_error_translated(link) == -EBADF;
}
+/* Probe whether an LSM BPF program can be attached, by attaching it once. The probe link is
+ * handed to the cleanup handler and thus released again when this function returns. Returns false
+ * if libbpf cannot be loaded or the attachment fails. */
+bool bpf_can_link_lsm_program(struct bpf_program *prog) {
+        _cleanup_(bpf_link_freep) struct bpf_link *probe = NULL;
+
+        assert(prog);
+
+        if (dlopen_bpf(LOG_DEBUG) >= 0) {
+                probe = sym_bpf_program__attach_lsm(prog);
+
+                /* When bpf_program__attach_lsm fails, the returned value carries a libbpf error
+                 * code rather than a usable pointer. This happens on architectures where the BPF
+                 * trampoline (and hence the BPF_LSM_MAC attach type) is not supported. */
+                return bpf_get_error_translated(probe) == 0;
+        }
+
+        return false;
+}
+
int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link) {
assert(key);
#include "shared-forward.h"
bool bpf_can_link_program(struct bpf_program *prog);
+bool bpf_can_link_lsm_program(struct bpf_program *prog);
int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link);