+++ /dev/null
-From a10eec941b82c9405218adbfc3d5f3d095fd2282 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Wed, 18 Jun 2025 22:53:39 +0200
-Subject: pidfs: persist information
-
-From: Christian Brauner <brauner@kernel.org>
-
-[ Upstream commit 8ec7c826d97b390879df2a03dfb035c70af86779 ]
-
-Persist exit and coredump information independent of whether anyone
-currently holds a pidfd for the struct pid.
-
-The current scheme allocated pidfs dentries on-demand repeatedly.
-This scheme is reaching it's limits as it makes it impossible to pin
-information that needs to be available after the task has exited or
-coredumped and that should not be lost simply because the pidfd got
-closed temporarily. The next opener should still see the stashed
-information.
-
-This is also a prerequisite for supporting extended attributes on
-pidfds to allow attaching meta information to them.
-
-If someone opens a pidfd for a struct pid a pidfs dentry is allocated
-and stashed in pid->stashed. Once the last pidfd for the struct pid is
-closed the pidfs dentry is released and removed from pid->stashed.
-
-So if 10 callers create a pidfs dentry for the same struct pid
-sequentially, i.e., each closing the pidfd before the other creates a
-new one then a new pidfs dentry is allocated every time.
-
-Because multiple tasks acquiring and releasing a pidfd for the same
-struct pid can race with each another a task may still find a valid
-pidfs entry from the previous task in pid->stashed and reuse it. Or it
-might find a dead dentry in there and fail to reuse it and so stashes a
-new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry
-but only one will succeed, the other ones will put their dentry.
-
-The current scheme aims to ensure that a pidfs dentry for a struct pid
-can only be created if the task is still alive or if a pidfs dentry
-already existed before the task was reaped and so exit information has
-been was stashed in the pidfs inode.
-
-That's great except that it's buggy. If a pidfs dentry is stashed in
-pid->stashed after pidfs_exit() but before __unhash_process() is called
-we will return a pidfd for a reaped task without exit information being
-available.
-
-The pidfds_pid_valid() check does not guard against this race as it
-doens't sync at all with pidfs_exit(). The pid_has_task() check might be
-successful simply because we're before __unhash_process() but after
-pidfs_exit().
-
-Introduce a new scheme where the lifetime of information associated with
-a pidfs entry (coredump and exit information) isn't bound to the
-lifetime of the pidfs inode but the struct pid itself.
-
-The first time a pidfs dentry is allocated for a struct pid a struct
-pidfs_attr will be allocated which will be used to store exit and
-coredump information.
-
-If all pidfs for the pidfs dentry are closed the dentry and inode can be
-cleaned up but the struct pidfs_attr will stick until the struct pid
-itself is freed. This will ensure minimal memory usage while persisting
-relevant information.
-
-The new scheme has various advantages. First, it allows to close the
-race where we end up handing out a pidfd for a reaped task for which no
-exit information is available. Second, it minimizes memory usage.
-Third, it allows to remove complex lifetime tracking via dentries when
-registering a struct pid with pidfs. There's no need to get or put a
-reference. Instead, the lifetime of exit and coredump information
-associated with a struct pid is bound to the lifetime of struct pid
-itself.
-
-Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org
-Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
-Signed-off-by: Christian Brauner <brauner@kernel.org>
-Stable-dep-of: 0b2d71a7c826 ("pidfs: Fix memory leak in pidfd_info()")
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- fs/pidfs.c | 212 +++++++++++++++++++++++++++++-------------
- include/linux/pid.h | 3 +
- include/linux/pidfs.h | 1 +
- kernel/pid.c | 2 +-
- 4 files changed, 151 insertions(+), 67 deletions(-)
-
-diff --git a/fs/pidfs.c b/fs/pidfs.c
-index 4c551bfa8927..568574fed576 100644
---- a/fs/pidfs.c
-+++ b/fs/pidfs.c
-@@ -25,7 +25,10 @@
- #include "internal.h"
- #include "mount.h"
-
-+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
-+
- static struct kmem_cache *pidfs_cachep __ro_after_init;
-+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
-
- /*
- * Stashes information that userspace needs to access even after the
-@@ -37,6 +40,11 @@ struct pidfs_exit_info {
- __u32 coredump_mask;
- };
-
-+struct pidfs_attr {
-+ struct pidfs_exit_info __pei;
-+ struct pidfs_exit_info *exit_info;
-+};
-+
- struct pidfs_inode {
- struct pidfs_exit_info __pei;
- struct pidfs_exit_info *exit_info;
-@@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid)
-
- pid->ino = pidfs_ino_nr;
- pid->stashed = NULL;
-+ pid->attr = NULL;
- pidfs_ino_nr++;
-
- write_seqcount_begin(&pidmap_lock_seq);
-@@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid)
- write_seqcount_end(&pidmap_lock_seq);
- }
-
-+void pidfs_free_pid(struct pid *pid)
-+{
-+ /*
-+ * Any dentry must've been wiped from the pid by now.
-+ * Otherwise there's a reference count bug.
-+ */
-+ VFS_WARN_ON_ONCE(pid->stashed);
-+
-+ if (!IS_ERR(pid->attr))
-+ kfree(pid->attr);
-+}
-+
- #ifdef CONFIG_PROC_FS
- /**
- * pidfd_show_fdinfo - print information about a pidfd
-@@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
- static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
- {
- struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
-- struct inode *inode = file_inode(file);
- struct pid *pid = pidfd_pid(file);
- size_t usize = _IOC_SIZE(cmd);
- struct pidfd_info kinfo = {};
- struct pidfs_exit_info *exit_info;
- struct user_namespace *user_ns;
- struct task_struct *task;
-+ struct pidfs_attr *attr;
- const struct cred *c;
- __u64 mask;
-
-@@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
- if (!pid_in_current_pidns(pid))
- return -ESRCH;
-
-+ attr = READ_ONCE(pid->attr);
- if (mask & PIDFD_INFO_EXIT) {
-- exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
-+ exit_info = READ_ONCE(attr->exit_info);
- if (exit_info) {
- kinfo.mask |= PIDFD_INFO_EXIT;
- #ifdef CONFIG_CGROUPS
-@@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
-
- if (mask & PIDFD_INFO_COREDUMP) {
- kinfo.mask |= PIDFD_INFO_COREDUMP;
-- kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
-+ kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
- }
-
- task = get_pid_task(pid, PIDTYPE_PID);
-@@ -552,41 +574,61 @@ struct pid *pidfd_pid(const struct file *file)
- * task has been reaped which cannot happen until we're out of
- * release_task().
- *
-- * If this struct pid is referred to by a pidfd then
-- * stashed_dentry_get() will return the dentry and inode for that struct
-- * pid. Since we've taken a reference on it there's now an additional
-- * reference from the exit path on it. Which is fine. We're going to put
-- * it again in a second and we know that the pid is kept alive anyway.
-+ * If this struct pid has at least once been referred to by a pidfd then
-+ * pid->attr will be allocated. If not we mark the struct pid as dead so
-+ * anyone who is trying to register it with pidfs will fail to do so.
-+ * Otherwise we would hand out pidfs for reaped tasks without having
-+ * exit information available.
- *
-- * Worst case is that we've filled in the info and immediately free the
-- * dentry and inode afterwards since the pidfd has been closed. Since
-+ * Worst case is that we've filled in the info and the pid gets freed
-+ * right away in free_pid() when no one holds a pidfd anymore. Since
- * pidfs_exit() currently is placed after exit_task_work() we know that
-- * it cannot be us aka the exiting task holding a pidfd to ourselves.
-+ * it cannot be us aka the exiting task holding a pidfd to itself.
- */
- void pidfs_exit(struct task_struct *tsk)
- {
-- struct dentry *dentry;
-+ struct pid *pid = task_pid(tsk);
-+ struct pidfs_attr *attr;
-+ struct pidfs_exit_info *exit_info;
-+#ifdef CONFIG_CGROUPS
-+ struct cgroup *cgrp;
-+#endif
-
- might_sleep();
-
-- dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
-- if (dentry) {
-- struct inode *inode = d_inode(dentry);
-- struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
--#ifdef CONFIG_CGROUPS
-- struct cgroup *cgrp;
-+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
-+ attr = pid->attr;
-+ if (!attr) {
-+ /*
-+ * No one ever held a pidfd for this struct pid.
-+ * Mark it as dead so no one can add a pidfs
-+ * entry anymore. We're about to be reaped and
-+ * so no exit information would be available.
-+ */
-+ pid->attr = PIDFS_PID_DEAD;
-+ return;
-+ }
-
-- rcu_read_lock();
-- cgrp = task_dfl_cgroup(tsk);
-- exit_info->cgroupid = cgroup_id(cgrp);
-- rcu_read_unlock();
-+ /*
-+ * If @pid->attr is set someone might still legitimately hold a
-+ * pidfd to @pid or someone might concurrently still be getting
-+ * a reference to an already stashed dentry from @pid->stashed.
-+ * So defer cleaning @pid->attr until the last reference to @pid
-+ * is put
-+ */
-+
-+ exit_info = &attr->__pei;
-+
-+#ifdef CONFIG_CGROUPS
-+ rcu_read_lock();
-+ cgrp = task_dfl_cgroup(tsk);
-+ exit_info->cgroupid = cgroup_id(cgrp);
-+ rcu_read_unlock();
- #endif
-- exit_info->exit_code = tsk->exit_code;
-+ exit_info->exit_code = tsk->exit_code;
-
-- /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
-- smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
-- dput(dentry);
-- }
-+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
-+ smp_store_release(&attr->exit_info, &attr->__pei);
- }
-
- #ifdef CONFIG_COREDUMP
-@@ -594,16 +636,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
- {
- struct pid *pid = cprm->pid;
- struct pidfs_exit_info *exit_info;
-- struct dentry *dentry;
-- struct inode *inode;
-+ struct pidfs_attr *attr;
- __u32 coredump_mask = 0;
-
-- dentry = pid->stashed;
-- if (WARN_ON_ONCE(!dentry))
-- return;
-+ attr = READ_ONCE(pid->attr);
-
-- inode = d_inode(dentry);
-- exit_info = &pidfs_i(inode)->__pei;
-+ VFS_WARN_ON_ONCE(!attr);
-+ VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
-+
-+ exit_info = &attr->__pei;
- /* Note how we were coredumped. */
- coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
- /* Note that we actually did coredump. */
-@@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
-
- static void pidfs_free_inode(struct inode *inode)
- {
-- kmem_cache_free(pidfs_cachep, pidfs_i(inode));
-+ kfree(pidfs_i(inode));
- }
-
- static const struct super_operations pidfs_sops = {
-@@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
- * recorded and published can be handled correctly.
- */
- if (unlikely(!pid_has_task(pid, type))) {
-- struct inode *inode = d_inode(path->dentry);
-- return !!READ_ONCE(pidfs_i(inode)->exit_info);
-+ struct pidfs_attr *attr;
-+
-+ attr = READ_ONCE(pid->attr);
-+ if (!attr)
-+ return false;
-+ if (!READ_ONCE(attr->exit_info))
-+ return false;
- }
-
- return true;
-@@ -878,9 +924,67 @@ static void pidfs_put_data(void *data)
- put_pid(pid);
- }
-
-+/**
-+ * pidfs_register_pid - register a struct pid in pidfs
-+ * @pid: pid to pin
-+ *
-+ * Register a struct pid in pidfs. Needs to be paired with
-+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
-+ *
-+ * Return: On success zero, on error a negative error code is returned.
-+ */
-+int pidfs_register_pid(struct pid *pid)
-+{
-+ struct pidfs_attr *new_attr __free(kfree) = NULL;
-+ struct pidfs_attr *attr;
-+
-+ might_sleep();
-+
-+ if (!pid)
-+ return 0;
-+
-+ attr = READ_ONCE(pid->attr);
-+ if (unlikely(attr == PIDFS_PID_DEAD))
-+ return PTR_ERR(PIDFS_PID_DEAD);
-+ if (attr)
-+ return 0;
-+
-+ new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
-+ if (!new_attr)
-+ return -ENOMEM;
-+
-+ /* Synchronize with pidfs_exit(). */
-+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
-+
-+ attr = pid->attr;
-+ if (unlikely(attr == PIDFS_PID_DEAD))
-+ return PTR_ERR(PIDFS_PID_DEAD);
-+ if (unlikely(attr))
-+ return 0;
-+
-+ pid->attr = no_free_ptr(new_attr);
-+ return 0;
-+}
-+
-+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
-+ struct dentry *dentry)
-+{
-+ int ret;
-+ struct pid *pid = d_inode(dentry)->i_private;
-+
-+ VFS_WARN_ON_ONCE(stashed != &pid->stashed);
-+
-+ ret = pidfs_register_pid(pid);
-+ if (ret)
-+ return ERR_PTR(ret);
-+
-+ return stash_dentry(stashed, dentry);
-+}
-+
- static const struct stashed_operations pidfs_stashed_ops = {
-- .init_inode = pidfs_init_inode,
-- .put_data = pidfs_put_data,
-+ .stash_dentry = pidfs_stash_dentry,
-+ .init_inode = pidfs_init_inode,
-+ .put_data = pidfs_put_data,
- };
-
- static int pidfs_init_fs_context(struct fs_context *fc)
-@@ -936,33 +1040,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
- return pidfd_file;
- }
-
--/**
-- * pidfs_register_pid - register a struct pid in pidfs
-- * @pid: pid to pin
-- *
-- * Register a struct pid in pidfs. Needs to be paired with
-- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
-- *
-- * Return: On success zero, on error a negative error code is returned.
-- */
--int pidfs_register_pid(struct pid *pid)
--{
-- struct path path __free(path_put) = {};
-- int ret;
--
-- might_sleep();
--
-- if (!pid)
-- return 0;
--
-- ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
-- if (unlikely(ret))
-- return ret;
-- /* Keep the dentry and only put the reference to the mount. */
-- path.dentry = NULL;
-- return 0;
--}
--
- /**
- * pidfs_get_pid - pin a struct pid through pidfs
- * @pid: pid to pin
-@@ -1008,6 +1085,9 @@ void __init pidfs_init(void)
- (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
- SLAB_ACCOUNT | SLAB_PANIC),
- pidfs_inode_init_once);
-+ pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
-+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
-+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
- pidfs_mnt = kern_mount(&pidfs_type);
- if (IS_ERR(pidfs_mnt))
- panic("Failed to mount pidfs pseudo filesystem");
-diff --git a/include/linux/pid.h b/include/linux/pid.h
-index 00646a692dd4..003a1027d219 100644
---- a/include/linux/pid.h
-+++ b/include/linux/pid.h
-@@ -47,6 +47,8 @@
-
- #define RESERVED_PIDS 300
-
-+struct pidfs_attr;
-+
- struct upid {
- int nr;
- struct pid_namespace *ns;
-@@ -60,6 +62,7 @@ struct pid {
- u64 ino;
- struct rb_node pidfs_node;
- struct dentry *stashed;
-+ struct pidfs_attr *attr;
- };
- /* lists of tasks that use this pid */
- struct hlist_head tasks[PIDTYPE_MAX];
-diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
-index 77e7db194914..8f6ed59bb3fb 100644
---- a/include/linux/pidfs.h
-+++ b/include/linux/pidfs.h
-@@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations;
- int pidfs_register_pid(struct pid *pid);
- void pidfs_get_pid(struct pid *pid);
- void pidfs_put_pid(struct pid *pid);
-+void pidfs_free_pid(struct pid *pid);
-
- #endif /* _LINUX_PID_FS_H */
-diff --git a/kernel/pid.c b/kernel/pid.c
-index 8317bcbc7cf7..07db7d8d066c 100644
---- a/kernel/pid.c
-+++ b/kernel/pid.c
-@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
-
- ns = pid->numbers[pid->level].ns;
- if (refcount_dec_and_test(&pid->count)) {
-- WARN_ON_ONCE(pid->stashed);
-+ pidfs_free_pid(pid);
- kmem_cache_free(ns->pid_cachep, pid);
- put_pid_ns(ns);
- }
---
-2.50.1
-