pidfs: implement ino allocation without the pidmap lock

author Mateusz Guzik <mjguzik@gmail.com>

Tue, 20 Jan 2026 18:45:39 +0000 (19:45 +0100)

committer Christian Brauner <brauner@kernel.org>

Tue, 10 Feb 2026 10:39:30 +0000 (11:39 +0100)
author Mateusz Guzik <mjguzik@gmail.com>
Tue, 20 Jan 2026 18:45:39 +0000 (19:45 +0100)
committer Christian Brauner <brauner@kernel.org>
Tue, 10 Feb 2026 10:39:30 +0000 (11:39 +0100)
diff --git a/fs/pidfs.c b/fs/pidfs.c

index ee0e36dd29d2cd1571ccc377981a561204778df5..b984d0e9573450c8bbcaa831b71a80ce982a7e9d 100644 (file)
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -23,6 +23,7 @@
  #include <linux/coredump.h>
  #include <linux/rhashtable.h>
  #include <linux/xattr.h>
+#include <linux/cookie.h>
  
  #include "internal.h"
  #include "mount.h"
@@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
         .automatic_shrinking    = true,
  };
  
+/*
+ * inode number handling
+ *
+ * On 64 bit nothing special happens. The 64bit number assigned
+ * to struct pid is the inode number.
+ *
+ * On 32 bit the 64 bit number assigned to struct pid is split
+ * into two 32 bit numbers. The lower 32 bits are used as the
+ * inode number and the upper 32 bits are used as the inode
+ * generation number.
+ *
+ * On 32 bit pidfs_ino() will return the lower 32 bit. When
+ * pidfs_ino() returns zero a wrap around happened. When a
+ * wraparound happens the 64 bit number will be incremented by 1
+ * so inode numbering starts at 1 again.
+ *
+ * On 64 bit comparing two pidfds is as simple as comparing
+ * inode numbers.
+ *
+ * When a wraparound happens on 32 bit multiple pidfds with the
+ * same inode number are likely to exist (This isn't a problem
+ * since before pidfs pidfds used the anonymous inode meaning
+ * all pidfds had the same inode number.). Userspace can
+ * reconstruct the 64 bit identifier by retrieving both the
+ * inode number and the inode generation number to compare or
+ * use file handles.
+ */
+
  #if BITS_PER_LONG == 32
+
+static DEFINE_SPINLOCK(pidfs_ino_lock); /* file-local; held around pidfs_ino_nr updates */
+static u64 pidfs_ino_nr = 1;            /* next 64 bit number handed out on 32 bit */
+
  static inline unsigned long pidfs_ino(u64 ino)
  {
         return lower_32_bits(ino);
@@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
         return upper_32_bits(ino);
  }
  
+static inline u64 pidfs_alloc_ino(void) /* 32 bit variant: returns the next 64 bit number */
+{
+       u64 ino;
+
+       spin_lock(&pidfs_ino_lock);        /* check, skip and advance must be one atomic step */
+       if (pidfs_ino(pidfs_ino_nr) == 0)  /* lower 32 bits (the inode number) would be 0 */
+               pidfs_ino_nr++;            /* skip that value so numbering restarts at 1 */
+       ino = pidfs_ino_nr++;              /* take the current value, advance for the next caller */
+       spin_unlock(&pidfs_ino_lock);
+       return ino;
+}
+
  #else
  
  /* On 64 bit simply return ino. */
@@ -90,61 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
  {
         return 0;
  }
-#endif
  
-/*
- * Allocate inode number and initialize pidfs fields.
- * Called with pidmap_lock held.
- */
-void pidfs_prepare_pid(struct pid *pid)
+DEFINE_COOKIE(pidfs_ino_cookie);        /* generator state read by pidfs_alloc_ino() below */
+
+static u64 pidfs_alloc_ino(void)        /* non-32-bit variant: returns the next 64 bit number */
  {
-       static u64 pidfs_ino_nr = 2;
+       u64 ino;
  
-       /*
-        * On 64 bit nothing special happens. The 64bit number assigned
-        * to struct pid is the inode number.
-        *
-        * On 32 bit the 64 bit number assigned to struct pid is split
-        * into two 32 bit numbers. The lower 32 bits are used as the
-        * inode number and the upper 32 bits are used as the inode
-        * generation number.
-        *
-        * On 32 bit pidfs_ino() will return the lower 32 bit. When
-        * pidfs_ino() returns zero a wrap around happened. When a
-        * wraparound happens the 64 bit number will be incremented by 2
-        * so inode numbering starts at 2 again.
-        *
-        * On 64 bit comparing two pidfds is as simple as comparing
-        * inode numbers.
-        *
-        * When a wraparound happens on 32 bit multiple pidfds with the
-        * same inode number are likely to exist (This isn't a problem
-        * since before pidfs pidfds used the anonymous inode meaning
-        * all pidfds had the same inode number.). Userspace can
-        * reconstruct the 64 bit identifier by retrieving both the
-        * inode number and the inode generation number to compare or
-        * use file handles.
-        */
-       if (pidfs_ino(pidfs_ino_nr) == 0)
-               pidfs_ino_nr += 2;
+       preempt_disable();              /* NOTE(review): presumably required by gen_cookie_next() - confirm */
+       ino = gen_cookie_next(&pidfs_ino_cookie);
+       preempt_enable();
+
+       VFS_WARN_ON_ONCE(ino < 1);      /* 0 is the "no number assigned" value stored in pid->ino */
+       return ino;
+}
+
+#endif
  
-       pid->ino = pidfs_ino_nr;
-       pid->pidfs_hash.next = NULL;
+void pidfs_prepare_pid(struct pid *pid) /* reset the pidfs fields; called from alloc_pid() */
+{
         pid->stashed = NULL;
         pid->attr = NULL;
-       pidfs_ino_nr++;
+       pid->ino = 0;                   /* no number yet; pidfs_add_pid() assigns one */
  }
  
  int pidfs_add_pid(struct pid *pid)
  {
-       return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
-                                     pidfs_ino_ht_params);
+       int ret;
+
+       pid->ino = pidfs_alloc_ino();   /* the number is allocated here, right before hashing */
+       ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+                                    pidfs_ino_ht_params);
+       if (unlikely(ret))
+               pid->ino = 0;           /* insert failed: 0 makes pidfs_remove_pid() skip the removal */
+       return ret;                     /* whatever rhashtable_insert_fast() returned */
  }
  
  void pidfs_remove_pid(struct pid *pid)
  {
-       rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
-                              pidfs_ino_ht_params);
+       if (likely(pid->ino))           /* ino == 0: never inserted, or the insert failed */
+               rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+                                      pidfs_ino_ht_params);
  }
  
  void pidfs_free_pid(struct pid *pid)
diff --git a/kernel/pid.c b/kernel/pid.c

index 06356e40ac000b902e66418ad2f56830a2fb1f97..72c9372b84b8f8f9572f44d272cf1162ed250ac8 100644 (file)
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
                 INIT_HLIST_HEAD(&pid->tasks[type]);
         init_waitqueue_head(&pid->wait_pidfd);
         INIT_HLIST_HEAD(&pid->inodes);
+       pidfs_prepare_pid(pid);
  
         /*
          * 2. perm check checkpoint_restore_ns_capable()
@@ -314,8 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
         retval = -ENOMEM;
         if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
                 goto out_free;
-       pidfs_prepare_pid(pid);
-
         for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
                 /* Make the PID visible to find_pid_ns. */
                 idr_replace(&upid->ns->idr, pid, upid->nr);
author	Mateusz Guzik <mjguzik@gmail.com>
	Tue, 20 Jan 2026 18:45:39 +0000 (19:45 +0100)
committer	Christian Brauner <brauner@kernel.org>
	Tue, 10 Feb 2026 10:39:30 +0000 (11:39 +0100)
fs/pidfs.c		patch \| blob \| blame \| history
kernel/pid.c		patch \| blob \| blame \| history