From: Daniel Borkmann Date: Tue, 2 Jun 2026 07:40:12 +0000 (+0200) Subject: bpf: Add simple xattr support to bpffs X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9722955b54307e9070994f2382ec06af3d7405e0;p=thirdparty%2Flinux.git bpf: Add simple xattr support to bpffs Add support for extended attributes on bpffs inodes so that user space and BPF LSM programs can attach metadata - for example, a content hash or a security label - to a pinned object or directory. BPF LSM or user space tooling can then uniformly look at this (e.g. security.bpf.*) in a similar way to other fs'es. The store is in-memory and non-persistent: it lives only for the lifetime of the mount, like everything else in bpffs. The modelling is similar to tmpfs. bpffs serves the trusted.* and security.* namespaces; user.* is left unsupported. As bpffs is FS_USERNS_MOUNT, security.* is reachable by the unprivileged mounter in a user namespace, and thus we are using the simple_xattr_set_limited infra there (trusted.* needs global CAP_SYS_ADMIN). bpf_fill_super() is open-coded instead of using simple_fill_super(), because the root inode must now be allocated through bpf_fs_alloc_inode(), i.e. carry the bpf_fs_inode wrapper and come from the right cache - which requires s_op (and s_xattr) to be installed before the first inode is created. While at it, also harden s_iflags with SB_I_NOEXEC and SB_I_NODEV. bpf_fs_listxattr() is only reachable through the filesystem via i_op->listxattr, so the BPF token inode is left untouched. Name-based fsetxattr()/fgetxattr() on a token fd still work since the get/set handlers are installed at the superblock. For the security.* namespace, we use simple_xattr_set_limited(), but there was no simple_xattr_add_limited() API yet, which is needed in bpf_fs_initxattrs() to avoid underflows in the accounting, so add it here. 
The symlink target is freed in bpf_free_inode() rather than in bpf_destroy_inode() so that it is released only after an RCU grace period, as an RCU path walk following the symlink may still dereference inode->i_link in security_inode_follow_link(). Lastly, the symlink target allocated in bpf_symlink() is switched to GFP_KERNEL_ACCOUNT, so the string is charged to the caller's memcg. Signed-off-by: Daniel Borkmann Link: https://patch.msgid.link/20260602074012.416289-1-daniel@iogearbox.net Cc: Christian Brauner Signed-off-by: Christian Brauner (Amutable) --- diff --git a/fs/xattr.c b/fs/xattr.c index 89374cd9029a7..ec2a4f3759d8b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1678,6 +1678,39 @@ int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, return 0; } +/** + * simple_xattr_add_limited - add an xattr object, charging per-inode limits + * @cache: anchor for the hash table + * @xattrs: the header of the xattr object + * @limits: per-inode limit counters + * @new_xattr: the xattr object to add + * + * Like simple_xattr_add(), but also accounts @new_xattr against @limits so + * that a later removal or replacement of it through simple_xattr_set_limited() + * decrements counters that were actually incremented, rather than underflowing + * them. Use this instead of simple_xattr_add() when seeding initial xattrs + * that share a namespace with the limited set/remove path. + * + * Return: On success zero is returned. On failure a negative error code is + * returned. 
+ */ +int simple_xattr_add_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, + struct simple_xattr_limits *limits, + struct simple_xattr *new_xattr) +{ + int err; + + err = simple_xattr_limits_inc(limits, new_xattr->size); + if (err) + return err; + + err = simple_xattr_add(cache, xattrs, new_xattr); + if (err) + simple_xattr_limits_dec(limits, new_xattr->size); + return err; +} + /** * simple_xattrs_free - free xattrs * @cache: anchor for the hash table diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca94..434ba91401c68 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ #include #include #include +#include #include struct bpf_verifier_env; @@ -1918,6 +1919,8 @@ struct bpf_mount_opts { u64 delegate_maps; u64 delegate_progs; u64 delegate_attachs; + + struct simple_xattr_cache xa_cache; }; struct bpf_token { diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 7aaaf4f8aff5b..54ac3cbc133f8 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -155,6 +155,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct list_head *xattrs, char *buffer, size_t size); int simple_xattr_add(struct simple_xattr_cache *cache, struct list_head *xattrs, struct simple_xattr *new_xattr); +int simple_xattr_add_limited(struct simple_xattr_cache *cache, + struct list_head *xattrs, + struct simple_xattr_limits *limits, + struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); void simple_xattr_cache_cleanup(struct simple_xattr_cache *cache); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 25c06a0118258..c3f79b5a2f8c0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,9 @@ #include #include #include +#include +#include + #include "preload/bpf_preload.h" enum bpf_type { @@ -30,6 +33,23 @@ enum bpf_type { BPF_TYPE_LINK, }; +struct bpf_fs_inode { + struct list_head xattrs; + struct simple_xattr_limits xlimits; + struct 
inode vfs_inode; +}; + +static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode) +{ + return container_of(inode, struct bpf_fs_inode, vfs_inode); +} + +static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init; + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info); +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size); + static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { @@ -94,10 +114,17 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) } static const struct inode_operations bpf_dir_iops; +static const struct inode_operations bpf_symlink_iops; -static const struct inode_operations bpf_prog_iops = { }; -static const struct inode_operations bpf_map_iops = { }; -static const struct inode_operations bpf_link_iops = { }; +static const struct inode_operations bpf_prog_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_map_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_link_iops = { + .listxattr = bpf_fs_listxattr, +}; struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -153,11 +180,19 @@ static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; + int ret; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return ERR_CAST(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ERR_PTR(ret); + } + inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -330,10 +365,20 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); + struct inode *inode; + int ret; + + inode = 
bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + inode->i_op = iops; inode->i_fop = fops; inode->i_private = raw; @@ -382,9 +427,11 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { - char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); struct inode *inode; + char *link; + int ret; + link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -394,13 +441,25 @@ static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, return PTR_ERR(inode); } - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &bpf_symlink_iops; inode->i_link = link; + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + bpf_dentry_finalize(dentry, inode, dir); return 0; } +static const struct inode_operations bpf_symlink_iops = { + .get_link = simple_get_link, + .listxattr = bpf_fs_listxattr, +}; + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mkdir = bpf_mkdir, @@ -409,6 +468,7 @@ static const struct inode_operations bpf_dir_iops = { .rename = simple_rename, .link = simple_link, .unlink = simple_unlink, + .listxattr = bpf_fs_listxattr, }; /* pin iterator link into bpffs */ @@ -762,22 +822,147 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static struct inode *bpf_fs_alloc_inode(struct super_block *sb) +{ + struct bpf_fs_inode *bi; + + bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + INIT_LIST_HEAD_RCU(&bi->xattrs); + simple_xattr_limits_init(&bi->xlimits); + return &bi->vfs_inode; +} + static void 
bpf_destroy_inode(struct inode *inode) { + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); enum bpf_type type; - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); - free_inode_nonrcu(inode); + simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL); +} + +static void bpf_free_inode(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode)); +} + +static int bpf_fs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + + name = xattr_full_name(handler, name); + return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size); +} + +enum { + BPF_FS_XATTR_UNSPEC, + BPF_FS_XATTR_SECURITY, + BPF_FS_XATTR_TRUSTED, +}; + +static int bpf_fs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + struct simple_xattr *old; + int err = -EINVAL; + + name = xattr_full_name(handler, name); + switch (handler->flags) { + case BPF_FS_XATTR_SECURITY: + err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, name, value, size, + flags); + break; + case BPF_FS_XATTR_TRUSTED: + old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name, + value, size, flags); + err = IS_ERR(old) ? 
PTR_ERR(old) : 0; + if (!err) + simple_xattr_free_rcu(old); + break; + } + if (err) + return err; + inode_set_ctime_current(inode); + return 0; +} + +static const struct xattr_handler bpf_fs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = BPF_FS_XATTR_TRUSTED, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler bpf_fs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = BPF_FS_XATTR_SECURITY, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler * const bpf_fs_xattr_handlers[] = { + &bpf_fs_trusted_xattr_handler, + &bpf_fs_security_xattr_handler, + NULL, +}; + +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct inode *inode = d_inode(dentry); + + return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size); +} + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + const struct xattr *xattr; + int err; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); + if (IS_ERR(new_xattr)) + return PTR_ERR(new_xattr); + + new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT, + XATTR_SECURITY_PREFIX "%s", + xattr->name); + if (!new_xattr->name) + return -ENOMEM; + + err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, new_xattr); + if (err) + return err; + + retain_and_null_ptr(new_xattr); + } + return 0; } const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, + .alloc_inode = bpf_fs_alloc_inode, .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { @@ -996,25 +1181,38 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { - 
static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; - int ret; /* Mounting an instance of BPF FS requires privileges */ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) return -EPERM; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); - if (ret) - return ret; - + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = BPF_FS_MAGIC; sb->s_op = &bpf_super_ops; + sb->s_xattr = bpf_fs_xattr_handlers; + sb->s_iflags |= SB_I_NOEXEC; + sb->s_iflags |= SB_I_NODEV; + sb->s_time_gran = 1; + + inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_ino = 1; + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; - inode = sb->s_root->d_inode; + inode = d_inode(sb->s_root); inode->i_uid = opts->uid; inode->i_gid = opts->gid; - inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; @@ -1068,6 +1266,7 @@ static void bpf_kill_super(struct super_block *sb) struct bpf_mount_opts *opts = sb->s_fs_info; kill_anon_super(sb); + simple_xattr_cache_cleanup(&opts->xa_cache); kfree(opts); } @@ -1080,18 +1279,37 @@ static struct file_system_type bpf_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; +static void bpf_fs_inode_init_once(void *foo) +{ + struct bpf_fs_inode *bi = foo; + + inode_init_once(&bi->vfs_inode); +} + static int __init bpf_init(void) { int ret; + bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache", + sizeof(struct bpf_fs_inode), + 0, SLAB_ACCOUNT, + bpf_fs_inode_init_once); + if (!bpf_fs_inode_cachep) + return -ENOMEM; + ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) - return ret; + goto out_cache; ret = register_filesystem(&bpf_fs_type); - if (ret) + if (ret) { sysfs_remove_mount_point(fs_kobj, "bpf"); + goto 
out_cache; + } + return 0; +out_cache: + kmem_cache_destroy(bpf_fs_inode_cachep); return ret; } fs_initcall(bpf_init);