#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
+#define CONTAINER_UID_MIN ((uid_t) CONTAINER_UID_BASE_MIN) /* inclusive lower bound tested by uid_is_container() */
+#define CONTAINER_UID_MAX ((uid_t) CONTAINER_UID_BASE_MAX + 0xFFFFU) /* inclusive upper bound: the base-max constant plus 0xFFFF */
+
#ifndef bpf_core_cast
/* bpf_rdonly_cast() was introduced in libbpf commit 688879f together with
* the definition of a bpf_core_cast macro. So use that one to avoid
return container_of(mnt, struct mount, mnt);
}
-static int validate_mount(struct vfsmount *v) {
- struct user_namespace *mount_userns, *task_userns, *p;
- unsigned task_userns_inode;
- struct task_struct *task;
- void *mnt_id_map;
+static inline bool uid_is_dynamic(uid_t uid) { /* true iff uid lies in [DYNAMIC_UID_MIN, DYNAMIC_UID_MAX], both ends inclusive */
+ return DYNAMIC_UID_MIN <= uid && uid <= DYNAMIC_UID_MAX;
+}
+
+static inline bool uid_is_container(uid_t uid) { /* true iff uid lies in [CONTAINER_UID_MIN, CONTAINER_UID_MAX], both ends inclusive */
+ return CONTAINER_UID_MIN <= uid && uid <= CONTAINER_UID_MAX;
+}
+
+static inline bool uid_is_transient(uid_t uid) { /* "transient" here means: in the dynamic range or in the container range */
+ return uid_is_dynamic(uid) || uid_is_container(uid);
+}
+
+static int userns_owns_mount(struct user_namespace *userns, struct vfsmount *v) { /* true if userns is the userns of v's mount namespace or one of its ancestors, false if the walk ends without a match, -EPERM on the rejected path below */
+ struct user_namespace *mount_userns, *p;
struct mount *m;
- int mnt_id;
/* Get user namespace from vfsmount */
m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount));
mount_userns = m->mnt_ns->user_ns;
- /* Get user namespace from task */
- task = (struct task_struct*) bpf_get_current_task_btf();
- task_userns = task->cred->user_ns;
-
- /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
- * yes immediately. */
p = mount_userns;
for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) { /* bounded walk up the ->parent chain, starting at the mount's user namespace */
- if (p == task_userns)
- return 0; /* our task's user namespace (or a child thereof) owns this superblock: allow! */
+ if (p == userns)
+ return true;
p = p->parent;
if (!p)
if (p) /* NOTE(review): the lines between these two ifs are not visible in this view; presumably this rejects chains deeper than the limit -- confirm */
return -EPERM;
+ return false;
+}
+
+static int validate_mount(struct vfsmount *v, int ret) { /* shared check for the create-style hooks: returns 0 to allow the operation on mount v, a non-zero error to deny it */
+ struct user_namespace *task_userns;
+ unsigned task_userns_inode;
+ struct task_struct *task;
+ void *mnt_id_map;
+ struct mount *m;
+ int mnt_id, r;
+
+ if (ret != 0) /* propagate earlier error */
+ return ret;
+
+ /* Get user namespace from task */
+ task = (struct task_struct*) bpf_get_current_task_btf();
+ task_userns = task->cred->user_ns;
+
+ /* fsuid/fsgid are the UID/GID in the initial user namespace, before any idmapped mounts have been
+ * applied. There is no way (yet) to figure out what the UID/GID that will be written to disk will be
+ * after idmapped mounts are taken into account, hence we have to rely on an allowlist of mounts
+ * populated by userspace which tells us if a mount has an appropriate uid mapping in place to
+ * translate the transient UID range to something else. For other UIDs/GIDs, there's no need to do
+ * these checks as we don't insist on idmapped mounts or such for UIDs/GIDs outside the transient
+ * ranges. */
+ if (!uid_is_transient(task->cred->fsuid.val) && !uid_is_transient((uid_t) task->cred->fsgid.val))
+ return 0;
+
+ r = userns_owns_mount(task_userns, v); /* negative: error, positive: mount is owned by our userns, zero: foreign mount */
+ if (r < 0)
+ return r;
+ /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
+ * yes immediately. */
+ if (r > 0)
+ return 0;
+
/* This is a mount foreign to our task's user namespace, let's consult our allow list */
task_userns_inode = task_userns->ns.inum; /* NOTE(review): the lookup that assigns mnt_id_map is not visible in this view; the chown hook keys userns_mnt_id_hash by this inode number */
if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */
return 0;
+ m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount));
mnt_id = m->mnt_id;
/* Otherwise, say yes if the mount ID is allowlisted */
return -EPERM; /* NOTE(review): the allowlist lookup that precedes this fallback is not visible in this view */
}
-static int validate_path(const struct path *path, int ret) {
- struct inode *inode;
+SEC("lsm/path_chown")
+int BPF_PROG(userns_restrict_path_chown, struct path *path, unsigned long long uid, unsigned long long gid, int ret) { /* denies chown to a transient UID/GID on foreign mounts for user namespaces that have an entry in userns_mnt_id_hash */
+ struct user_namespace *task_userns;
+ unsigned task_userns_inode;
+ struct task_struct *task;
struct vfsmount *v;
+ void *mnt_id_map;
+ int r;
if (ret != 0) /* propagate earlier error */
return ret;
+ /* Get user namespace from task */
+ task = (struct task_struct*) bpf_get_current_task_btf();
+ task_userns = task->cred->user_ns;
v = path->mnt;
- return validate_mount(v);
-}
+ r = userns_owns_mount(task_userns, v); /* negative: error, positive: mount is owned by our userns, zero: foreign mount */
+ if (r < 0)
+ return r;
+ /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
+ * yes immediately. */
+ if (r > 0)
+ return 0;
-SEC("lsm/path_chown")
-int BPF_PROG(userns_restrict_path_chown, struct path *path, void* uid, void *gid, int ret) {
- return validate_path(path, ret);
+ /* This is a mount foreign to our task's user namespace. If the user namespace was provisioned by
+ * nsresourced, refuse any UIDs/GIDs in the transient ranges. Note that we can only do this check in
+ * the chown() hook because it receives the UID/GID with idmapped mounts already taken into account,
+ * unlike the other hooks where we cannot (yet) figure out the UID/GID after idmapped mounts are
+ * applied. Hence in the other hooks we have to rely on the mount allowlist to ensure the transient
+ * fsuid/fsgid will be translated to something else when written to disk but in the chown() hook we
+ * can check the provided UID/GID directly to see if it is transient or not. */
+
+ /* User namespaces that were not provisioned by nsresourced can still write to the transient ranges
+ * so that we don't break use cases like systemd-nspawn's --private-users=pick switch. */
+
+ task_userns_inode = task_userns->ns.inum; /* the map below is keyed by the user namespace's inode number */
+
+ mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &task_userns_inode); /* only the presence of an entry matters here; its contents are not read in this hook */
+ if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */
+ return 0;
+
+ if (uid_is_transient((uid_t) uid) || uid_is_transient((uid_t) gid)) /* the 64-bit hook arguments are narrowed to uid_t before the range check */
+ return -EPERM;
+
+ return 0;
}
SEC("lsm/path_mkdir")
int BPF_PROG(userns_restrict_path_mkdir, struct path *dir, struct dentry *dentry, umode_t mode, int ret) {
- return validate_path(dir, ret);
+ return validate_mount(dir->mnt, ret); /* decision depends only on the mount backing dir; a non-zero ret is passed through unchanged */
}
/* The mknod hook covers all file creations, including regular files, in case the reader is looking for a
* missing hook for open(). */
SEC("lsm/path_mknod")
int BPF_PROG(userns_restrict_path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned dev, int ret) {
- return validate_path(dir, ret);
+ return validate_mount(dir->mnt, ret); /* decision depends only on the mount backing dir; a non-zero ret is passed through unchanged */
}
SEC("lsm/path_symlink")
int BPF_PROG(userns_restrict_path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name, int ret) {
- return validate_path(dir, ret);
+ return validate_mount(dir->mnt, ret); /* decision depends only on the mount backing dir; a non-zero ret is passed through unchanged */
}
SEC("lsm/path_link")
int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, int ret) {
- return validate_path(new_dir, ret);
+ return validate_mount(new_dir->mnt, ret); /* only the destination directory's mount is checked; a non-zero ret is passed through unchanged */
}
SEC("kprobe/retire_userns_sysctls")
#include "rm-rf.h"
#include "tests.h"
#include "tmpfile-util.h"
+#include "uid-classification.h"
#include "userns-restrict.h"
static int make_tmpfs_fsmount(void) {
int r;
ASSERT_OK(mkdtemp_malloc(NULL, &t));
+ /* Make sure the dir is owned by the transient UID we'll be using so we don't get rejected with a
+ * permission error before we even get to the BPF-LSM. */
+ ASSERT_OK_ERRNO(chown(t, CONTAINER_UID_MIN, CONTAINER_UID_MIN));
host_fd1 = ASSERT_OK_ERRNO(open(t, O_DIRECTORY|O_CLOEXEC));
host_tmpfs = ASSERT_OK(make_tmpfs_fsmount());
- userns_fd = ASSERT_OK(userns_acquire("0 0 1", "0 0 1", /* setgroups_deny= */ true));
+
+ _cleanup_free_ char *idmap = NULL;
+ ASSERT_OK(asprintf(&idmap, "0 "UID_FMT" 1", CONTAINER_UID_MIN));
+ userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ true));
ASSERT_OK(userns_restrict_put_by_fd(
bpf_obj,
if (r == 0) {
_cleanup_close_ int private_tmpfs = -EBADF;
- ASSERT_OK_ERRNO(setns(userns_fd, CLONE_NEWUSER));
+ ASSERT_OK(namespace_enter(-EBADF, -EBADF, -EBADF, userns_fd, -EBADF));
ASSERT_OK_ERRNO(unshare(CLONE_NEWNS));
/* Allocate tmpfs locally */