From 87caaeef79950377b616f3ba2265a82742cb9583 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Tue, 20 Jan 2026 19:45:39 +0100
Subject: [PATCH] pidfs: implement ino allocation without the pidmap lock

This paves the way for scalable PID allocation later. The 32 bit
variant merely takes a spinlock for simplicity, the 64 bit variant
uses a scalable scheme.

Signed-off-by: Mateusz Guzik
Link: https://patch.msgid.link/20260120184539.1480930-1-mjguzik@gmail.com
Co-developed-by: Christian Brauner
Signed-off-by: Christian Brauner
---
 fs/pidfs.c   | 113 ++++++++++++++++++++++++++++++++-------------------
 kernel/pid.c |   3 +-
 2 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index ee0e36dd29d2..b984d0e95734 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <linux/cookie.h>
 
 #include "internal.h"
 #include "mount.h"
@@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
 	.automatic_shrinking	= true,
 };
 
+/*
+ * inode number handling
+ *
+ * On 64 bit nothing special happens. The 64bit number assigned
+ * to struct pid is the inode number.
+ *
+ * On 32 bit the 64 bit number assigned to struct pid is split
+ * into two 32 bit numbers. The lower 32 bits are used as the
+ * inode number and the upper 32 bits are used as the inode
+ * generation number.
+ *
+ * On 32 bit pidfs_ino() will return the lower 32 bit. When
+ * pidfs_ino() returns zero a wrap around happened. When a
+ * wraparound happens the 64 bit number will be incremented by 1
+ * so inode numbering starts at 1 again.
+ *
+ * On 64 bit comparing two pidfds is as simple as comparing
+ * inode numbers.
+ *
+ * When a wraparound happens on 32 bit multiple pidfds with the
+ * same inode number are likely to exist (This isn't a problem
+ * since before pidfs pidfds used the anonymous inode meaning
+ * all pidfds had the same inode number.). Userspace can
+ * reconstruct the 64 bit identifier by retrieving both the
+ * inode number and the inode generation number to compare or
+ * use file handles.
+ */
+
 #if BITS_PER_LONG == 32
+
+DEFINE_SPINLOCK(pidfs_ino_lock);
+static u64 pidfs_ino_nr = 1;
+
 static inline unsigned long pidfs_ino(u64 ino)
 {
 	return lower_32_bits(ino);
@@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
 	return upper_32_bits(ino);
 }
 
+static inline u64 pidfs_alloc_ino(void)
+{
+	u64 ino;
+
+	spin_lock(&pidfs_ino_lock);
+	if (pidfs_ino(pidfs_ino_nr) == 0)
+		pidfs_ino_nr++;
+	ino = pidfs_ino_nr++;
+	spin_unlock(&pidfs_ino_lock);
+	return ino;
+}
+
 #else
 
 /* On 64 bit simply return ino. */
@@ -90,61 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
 {
 	return 0;
 }
-#endif
 
-/*
- * Allocate inode number and initialize pidfs fields.
- * Called with pidmap_lock held.
- */
-void pidfs_prepare_pid(struct pid *pid)
+DEFINE_COOKIE(pidfs_ino_cookie);
+
+static u64 pidfs_alloc_ino(void)
 {
-	static u64 pidfs_ino_nr = 2;
+	u64 ino;
 
-	/*
-	 * On 64 bit nothing special happens. The 64bit number assigned
-	 * to struct pid is the inode number.
-	 *
-	 * On 32 bit the 64 bit number assigned to struct pid is split
-	 * into two 32 bit numbers. The lower 32 bits are used as the
-	 * inode number and the upper 32 bits are used as the inode
-	 * generation number.
-	 *
-	 * On 32 bit pidfs_ino() will return the lower 32 bit. When
-	 * pidfs_ino() returns zero a wrap around happened. When a
-	 * wraparound happens the 64 bit number will be incremented by 2
-	 * so inode numbering starts at 2 again.
-	 *
-	 * On 64 bit comparing two pidfds is as simple as comparing
-	 * inode numbers.
-	 *
-	 * When a wraparound happens on 32 bit multiple pidfds with the
-	 * same inode number are likely to exist (This isn't a problem
-	 * since before pidfs pidfds used the anonymous inode meaning
-	 * all pidfds had the same inode number.). Userspace can
-	 * reconstruct the 64 bit identifier by retrieving both the
-	 * inode number and the inode generation number to compare or
-	 * use file handles.
-	 */
-	if (pidfs_ino(pidfs_ino_nr) == 0)
-		pidfs_ino_nr += 2;
+	preempt_disable();
+	ino = gen_cookie_next(&pidfs_ino_cookie);
+	preempt_enable();
+
+	VFS_WARN_ON_ONCE(ino < 1);
+	return ino;
+}
+
+#endif
 
-	pid->ino = pidfs_ino_nr;
-	pid->pidfs_hash.next = NULL;
+void pidfs_prepare_pid(struct pid *pid)
+{
 	pid->stashed = NULL;
 	pid->attr = NULL;
-	pidfs_ino_nr++;
+	pid->ino = 0;
 }
 
 int pidfs_add_pid(struct pid *pid)
 {
-	return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
-				      pidfs_ino_ht_params);
+	int ret;
+
+	pid->ino = pidfs_alloc_ino();
+	ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+				     pidfs_ino_ht_params);
+	if (unlikely(ret))
+		pid->ino = 0;
+	return ret;
 }
 
 void pidfs_remove_pid(struct pid *pid)
 {
-	rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
-			       pidfs_ino_ht_params);
+	if (likely(pid->ino))
+		rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+				       pidfs_ino_ht_params);
 }
 
 void pidfs_free_pid(struct pid *pid)
diff --git a/kernel/pid.c b/kernel/pid.c
index 06356e40ac00..72c9372b84b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 	init_waitqueue_head(&pid->wait_pidfd);
 	INIT_HLIST_HEAD(&pid->inodes);
+	pidfs_prepare_pid(pid);
 
 	/*
 	 * 2. perm check checkpoint_restore_ns_capable()
@@ -314,8 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 	retval = -ENOMEM;
 	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
 		goto out_free;
-	pidfs_prepare_pid(pid);
-
 	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
-- 
2.47.3