nsresourced: Allow operations outside transient UID/GID ranges

author Daan De Meyer <daan.j.demeyer@gmail.com>

Fri, 23 Jan 2026 11:30:59 +0000 (12:30 +0100)

committer Daan De Meyer <daan@amutable.com>

Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
author Daan De Meyer <daan.j.demeyer@gmail.com>
Fri, 23 Jan 2026 11:30:59 +0000 (12:30 +0100)
committer Daan De Meyer <daan@amutable.com>
Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
diff --git a/meson.build b/meson.build

index b50466dcfd0ea99fe7a302f9b135eb168bbf3297..e021b568ca53fde6935ed8b59b8cfc0b03d9a936 100644 (file)
--- a/meson.build
+++ b/meson.build
@@ -1798,7 +1798,7 @@ if conf.get('BPF_FRAMEWORK') == 1
                  ]
          endif
  
-        bpf_o_unstripped_cmd += ['-I.']
+        bpf_o_unstripped_cmd += ['-I.', '-include', 'config.h']
  
          if cc.get_id() == 'gcc' or meson.is_cross_build()
                  if cc.get_id() != 'gcc'
diff --git a/src/basic/uid-classification.h b/src/basic/uid-classification.h

index 5b75c0ab7869cba1bd467c8c3472576129668e5b..6735e06b322184e3afed7f166b8585d0e2fa06e6 100644 (file)
--- a/src/basic/uid-classification.h
+++ b/src/basic/uid-classification.h
@@ -8,8 +8,8 @@ assert_cc((CONTAINER_UID_BASE_MIN & 0xFFFFU) == 0);
  assert_cc((CONTAINER_UID_BASE_MAX & 0xFFFFU) == 0);
  
  /* Given we assign 64K UIDs to containers, the last container UID is 0xFFFF larger than the base */
-#define CONTAINER_UID_MIN (CONTAINER_UID_BASE_MIN)
-#define CONTAINER_UID_MAX (CONTAINER_UID_BASE_MAX + 0xFFFFU)
+#define CONTAINER_UID_MIN ((uid_t) CONTAINER_UID_BASE_MIN)
+#define CONTAINER_UID_MAX ((uid_t) CONTAINER_UID_BASE_MAX + 0xFFFFU)
  
  assert_cc((FOREIGN_UID_BASE & 0xFFFFU) == 0);
  #define FOREIGN_UID_MIN (FOREIGN_UID_BASE)
diff --git a/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c b/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c

index dbb0858682da554c8d6a99fe8ff5bc66d35d84af..dbfcf59b28671eabb9d6c693aed5901037245f3c 100644 (file)
--- a/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c
+++ b/src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c
@@ -20,6 +20,9 @@
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>
  
+#define CONTAINER_UID_MIN ((uid_t) CONTAINER_UID_BASE_MIN)
+#define CONTAINER_UID_MAX ((uid_t) CONTAINER_UID_BASE_MAX + 0xFFFFU)
+
  #ifndef bpf_core_cast
  /* bpf_rdonly_cast() was introduced in libbpf commit 688879f together with
   * the definition of a bpf_core_cast macro. So use that one to avoid
@@ -68,28 +71,30 @@ static inline struct mount *real_mount(struct vfsmount *mnt) {
          return container_of(mnt, struct mount, mnt);
  }
  
-static int validate_mount(struct vfsmount *v) {
-        struct user_namespace *mount_userns, *task_userns, *p;
-        unsigned task_userns_inode;
-        struct task_struct *task;
-        void *mnt_id_map;
+static inline bool uid_is_dynamic(uid_t uid) {
+        return DYNAMIC_UID_MIN <= uid && uid <= DYNAMIC_UID_MAX;
+}
+
+static inline bool uid_is_container(uid_t uid) {
+        return CONTAINER_UID_MIN <= uid && uid <= CONTAINER_UID_MAX;
+}
+
+static inline bool uid_is_transient(uid_t uid) {
+        return uid_is_dynamic(uid) || uid_is_container(uid);
+}
+
+static int userns_owns_mount(struct user_namespace *userns, struct vfsmount *v) {
+        struct user_namespace *mount_userns, *p;
          struct mount *m;
-        int mnt_id;
  
          /* Get user namespace from vfsmount */
          m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount));
          mount_userns = m->mnt_ns->user_ns;
  
-        /* Get user namespace from task */
-        task = (struct task_struct*) bpf_get_current_task_btf();
-        task_userns = task->cred->user_ns;
-
-        /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
-         * yes immediately. */
          p = mount_userns;
          for (unsigned i = 0; i < USER_NAMESPACE_DEPTH_MAX; i++) {
-                if (p == task_userns)
-                        return 0; /* our task's user namespace (or a child thereof) owns this superblock: allow! */
+                if (p == userns)
+                        return true;
  
                  p = p->parent;
                  if (!p)
@@ -101,6 +106,42 @@ static int validate_mount(struct vfsmount *v) {
          if (p)
                  return -EPERM;
  
+        return false;
+}
+
+static int validate_mount(struct vfsmount *v, int ret) {
+        struct user_namespace *task_userns;
+        unsigned task_userns_inode;
+        struct task_struct *task;
+        void *mnt_id_map;
+        struct mount *m;
+        int mnt_id, r;
+
+        if (ret != 0) /* propagate earlier error */
+                return ret;
+
+        /* Get user namespace from task */
+        task = (struct task_struct*) bpf_get_current_task_btf();
+        task_userns = task->cred->user_ns;
+
+        /* fsuid/fsgid are the UID/GID in the initial user namespace, before any idmapped mounts have been
+         * applied. There is no way (yet) to figure out what the UID/GID that will be written to disk will be
+         * after idmapped mounts are taken into account, hence we have to rely on an allowlist of mounts
+         * populated by userspace which tells us if a mount has an appropriate uid mapping in place to
+         * translate the transient UID range to something else. For other UIDs/GIDs, there's no need to do
+         * these checks as we don't insist on idmapped mounts or such for UIDs/GIDs outside the transient
+         * ranges. */
+        if (!uid_is_transient(task->cred->fsuid.val) && !uid_is_transient((uid_t) task->cred->fsgid.val))
+                return 0;
+
+        r = userns_owns_mount(task_userns, v);
+        if (r < 0)
+                return r;
+        /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
+         * yes immediately. */
+        if (r > 0)
+                return 0;
+
          /* This is a mount foreign to our task's user namespace, let's consult our allow list */
          task_userns_inode = task_userns->ns.inum;
  
@@ -108,6 +149,7 @@ static int validate_mount(struct vfsmount *v) {
          if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */
                  return 0;
  
+        m = bpf_rdonly_cast(real_mount(v), bpf_core_type_id_kernel(struct mount));
          mnt_id = m->mnt_id;
  
          /* Otherwise, say yes if the mount ID is allowlisted */
@@ -117,43 +159,74 @@ static int validate_mount(struct vfsmount *v) {
          return -EPERM;
  }
  
-static int validate_path(const struct path *path, int ret) {
-        struct inode *inode;
+SEC("lsm/path_chown")
+int BPF_PROG(userns_restrict_path_chown, struct path *path, unsigned long long uid, unsigned long long gid, int ret) {
+        struct user_namespace *task_userns;
+        unsigned task_userns_inode;
+        struct task_struct *task;
          struct vfsmount *v;
+        void *mnt_id_map;
+        int r;
  
          if (ret != 0) /* propagate earlier error */
                  return ret;
  
+        /* Get user namespace from task */
+        task = (struct task_struct*) bpf_get_current_task_btf();
+        task_userns = task->cred->user_ns;
          v = path->mnt;
  
-        return validate_mount(v);
-}
+        r = userns_owns_mount(task_userns, v);
+        if (r < 0)
+                return r;
+        /* Is the file on a mount that belongs to our own user namespace or a child of it? If so, say
+         * yes immediately. */
+        if (r > 0)
+                return 0;
  
-SEC("lsm/path_chown")
-int BPF_PROG(userns_restrict_path_chown, struct path *path, void* uid, void *gid, int ret) {
-        return validate_path(path, ret);
+        /* This is a mount foreign to our task's user namespace. If the user namespace was provisioned by
+         * nsresourced, refuse any UIDs/GIDs in the transient ranges. Note that we can only do this check in
+         * the chown() hook because it receives the UID/GID with idmapped mounts already taken into account,
+         * unlike the other hooks where we cannot (yet) figure out the UID/GID after idmapped mounts are
+         * applied. Hence in the other hooks we have to rely on the mount allowlist to ensure the transient
+         * fsuid/fsgid will be translated to something else when written to disk but in the chown() hook we
+         * can check the provided UID/GID directly to see if it is transient or not. */
+
+        /* User namespaces that were not provisioned by nsresourced can still write to the transient ranges
+         * so that we don't break use cases like systemd-nspawn's --private-users=pick switch. */
+
+        task_userns_inode = task_userns->ns.inum;
+
+        mnt_id_map = bpf_map_lookup_elem(&userns_mnt_id_hash, &task_userns_inode);
+        if (!mnt_id_map) /* No rules installed for this userns? Then say yes, too! */
+                return 0;
+
+        if (uid_is_transient((uid_t) uid) || uid_is_transient((uid_t) gid))
+                return -EPERM;
+
+        return 0;
  }
  
  SEC("lsm/path_mkdir")
  int BPF_PROG(userns_restrict_path_mkdir, struct path *dir, struct dentry *dentry, umode_t mode, int ret) {
-        return validate_path(dir, ret);
+        return validate_mount(dir->mnt, ret);
  }
  
  /* The mknod hook covers all file creations, including regular files, in case the reader is looking for a
   * missing hook for open(). */
  SEC("lsm/path_mknod")
  int BPF_PROG(userns_restrict_path_mknod, const struct path *dir, struct dentry *dentry, umode_t mode, unsigned dev, int ret) {
-        return validate_path(dir, ret);
+        return validate_mount(dir->mnt, ret);
  }
  
  SEC("lsm/path_symlink")
  int BPF_PROG(userns_restrict_path_symlink, const struct path *dir, struct dentry *dentry, const char *old_name, int ret) {
-        return validate_path(dir, ret);
+        return validate_mount(dir->mnt, ret);
  }
  
  SEC("lsm/path_link")
  int BPF_PROG(userns_restrict_path_link, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, int ret) {
-        return validate_path(new_dir, ret);
+        return validate_mount(new_dir->mnt, ret);
  }
  
  SEC("kprobe/retire_userns_sysctls")
diff --git a/src/nsresourced/test-userns-restrict.c b/src/nsresourced/test-userns-restrict.c

index dc06b7b08f75b2284dade2400a2063fdd327439e..853fc1441f6d7c95874862244d486103d03f01f6 100644 (file)
--- a/src/nsresourced/test-userns-restrict.c
+++ b/src/nsresourced/test-userns-restrict.c
@@ -14,6 +14,7 @@
  #include "rm-rf.h"
  #include "tests.h"
  #include "tmpfile-util.h"
+#include "uid-classification.h"
  #include "userns-restrict.h"
  
  static int make_tmpfs_fsmount(void) {
@@ -50,10 +51,16 @@ TEST(userns_restrict) {
          int r;
  
          ASSERT_OK(mkdtemp_malloc(NULL, &t));
+        /* Make sure the dir is owned by the transient UID we'll be using so we don't get rejected with a
+         * permission error before we even get to the BPF-LSM. */
+        ASSERT_OK_ERRNO(chown(t, CONTAINER_UID_MIN, CONTAINER_UID_MIN));
  
          host_fd1 = ASSERT_OK_ERRNO(open(t, O_DIRECTORY|O_CLOEXEC));
          host_tmpfs = ASSERT_OK(make_tmpfs_fsmount());
-        userns_fd = ASSERT_OK(userns_acquire("0 0 1", "0 0 1", /* setgroups_deny= */ true));
+
+        _cleanup_free_ char *idmap = NULL;
+        ASSERT_OK(asprintf(&idmap, "0 "UID_FMT" 1", CONTAINER_UID_MIN));
+        userns_fd = ASSERT_OK(userns_acquire(idmap, idmap, /* setgroups_deny= */ true));
  
          ASSERT_OK(userns_restrict_put_by_fd(
                          bpf_obj,
@@ -69,7 +76,7 @@ TEST(userns_restrict) {
          if (r == 0) {
                  _cleanup_close_ int private_tmpfs = -EBADF;
  
-                ASSERT_OK_ERRNO(setns(userns_fd, CLONE_NEWUSER));
+                ASSERT_OK(namespace_enter(-EBADF, -EBADF, -EBADF, userns_fd, -EBADF));
                  ASSERT_OK_ERRNO(unshare(CLONE_NEWNS));
  
                  /* Allocate tmpfs locally */
author	Daan De Meyer <daan.j.demeyer@gmail.com>
	Fri, 23 Jan 2026 11:30:59 +0000 (12:30 +0100)
committer	Daan De Meyer <daan@amutable.com>
	Tue, 24 Feb 2026 17:29:37 +0000 (18:29 +0100)
meson.build		patch \| blob \| blame \| history
src/basic/uid-classification.h		patch \| blob \| blame \| history
src/nsresourced/bpf/userns-restrict/userns-restrict.bpf.c		patch \| blob \| blame \| history
src/nsresourced/test-userns-restrict.c		patch \| blob \| blame \| history