1 // SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/pseudo_fs.h>
#include <linux/rcupdate.h>
#include <linux/seq_file.h>
#include <uapi/linux/pidfd.h>
#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 * If the pidfd no longer refers to a task, -1 is shown.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid of a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = pidfd_pid(f);
	struct pid_namespace *ns;
	pid_t nr = -1;

	/* ns is only assigned (and only valid) when the pid still has a task. */
	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/* If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif
87 * Poll support for process exit notification.
89 static __poll_t
pidfd_poll(struct file
*file
, struct poll_table_struct
*pts
)
91 struct pid
*pid
= pidfd_pid(file
);
92 bool thread
= file
->f_flags
& PIDFD_THREAD
;
93 struct task_struct
*task
;
94 __poll_t poll_flags
= 0;
96 poll_wait(file
, &pid
->wait_pidfd
, pts
);
98 * Depending on PIDFD_THREAD, inform pollers when the thread
99 * or the whole thread-group exits.
102 task
= pid_task(pid
, PIDTYPE_PID
);
104 poll_flags
= EPOLLIN
| EPOLLRDNORM
| EPOLLHUP
;
105 else if (task
->exit_state
&& (thread
|| thread_group_empty(task
)))
106 poll_flags
= EPOLLIN
| EPOLLRDNORM
;
111 static const struct file_operations pidfs_file_operations
= {
113 #ifdef CONFIG_PROC_FS
114 .show_fdinfo
= pidfd_show_fdinfo
,
118 struct pid
*pidfd_pid(const struct file
*file
)
120 if (file
->f_op
!= &pidfs_file_operations
)
121 return ERR_PTR(-EBADF
);
122 return file_inode(file
)->i_private
;
125 static struct vfsmount
*pidfs_mnt __ro_after_init
;
127 #if BITS_PER_LONG == 32
129 * Provide a fallback mechanism for 32-bit systems so processes remain
130 * reliably comparable by inode number even on those systems.
132 static DEFINE_IDA(pidfd_inum_ida
);
134 static int pidfs_inum(struct pid
*pid
, unsigned long *ino
)
138 ret
= ida_alloc_range(&pidfd_inum_ida
, RESERVED_PIDS
+ 1,
139 UINT_MAX
, GFP_ATOMIC
);
147 static inline void pidfs_free_inum(unsigned long ino
)
150 ida_free(&pidfd_inum_ida
, ino
);
153 static inline int pidfs_inum(struct pid
*pid
, unsigned long *ino
)
158 #define pidfs_free_inum(ino) ((void)(ino))
162 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
163 * implemented. Let's reject it completely until we have a clean
164 * permission concept for pidfds.
166 static int pidfs_setattr(struct mnt_idmap
*idmap
, struct dentry
*dentry
,
172 static int pidfs_getattr(struct mnt_idmap
*idmap
, const struct path
*path
,
173 struct kstat
*stat
, u32 request_mask
,
174 unsigned int query_flags
)
176 struct inode
*inode
= d_inode(path
->dentry
);
178 generic_fillattr(&nop_mnt_idmap
, request_mask
, inode
, stat
);
182 static const struct inode_operations pidfs_inode_operations
= {
183 .getattr
= pidfs_getattr
,
184 .setattr
= pidfs_setattr
,
187 static void pidfs_evict_inode(struct inode
*inode
)
189 struct pid
*pid
= inode
->i_private
;
193 pidfs_free_inum(inode
->i_ino
);
196 static const struct super_operations pidfs_sops
= {
197 .drop_inode
= generic_delete_inode
,
198 .evict_inode
= pidfs_evict_inode
,
199 .statfs
= simple_statfs
,
202 static char *pidfs_dname(struct dentry
*dentry
, char *buffer
, int buflen
)
204 struct inode
*inode
= d_inode(dentry
);
205 struct pid
*pid
= inode
->i_private
;
207 return dynamic_dname(buffer
, buflen
, "pidfd:[%llu]", pid
->ino
);
210 static const struct dentry_operations pidfs_dentry_operations
= {
211 .d_delete
= always_delete_dentry
,
212 .d_dname
= pidfs_dname
,
213 .d_prune
= stashed_dentry_prune
,
216 static int pidfs_init_inode(struct inode
*inode
, void *data
)
218 inode
->i_private
= data
;
219 inode
->i_flags
|= S_PRIVATE
;
220 inode
->i_mode
|= S_IRWXU
;
221 inode
->i_op
= &pidfs_inode_operations
;
222 inode
->i_fop
= &pidfs_file_operations
;
224 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
225 * avoids collisions with the root inode which is 1 for pseudo
228 return pidfs_inum(data
, &inode
->i_ino
);
/* Drop the struct pid reference carried by @data. */
static void pidfs_put_data(void *data)
{
	struct pid *pid = data;

	put_pid(pid);
}
237 static const struct stashed_operations pidfs_stashed_ops
= {
238 .init_inode
= pidfs_init_inode
,
239 .put_data
= pidfs_put_data
,
242 static int pidfs_init_fs_context(struct fs_context
*fc
)
244 struct pseudo_fs_context
*ctx
;
246 ctx
= init_pseudo(fc
, PID_FS_MAGIC
);
250 ctx
->ops
= &pidfs_sops
;
251 ctx
->dops
= &pidfs_dentry_operations
;
252 fc
->s_fs_info
= (void *)&pidfs_stashed_ops
;
256 static struct file_system_type pidfs_type
= {
258 .init_fs_context
= pidfs_init_fs_context
,
259 .kill_sb
= kill_anon_super
,
262 struct file
*pidfs_alloc_file(struct pid
*pid
, unsigned int flags
)
265 struct file
*pidfd_file
;
269 ret
= path_from_stashed(&pid
->stashed
, pidfs_mnt
, get_pid(pid
), &path
);
273 pidfd_file
= dentry_open(&path
, flags
, current_cred());
278 void __init
pidfs_init(void)
280 pidfs_mnt
= kern_mount(&pidfs_type
);
281 if (IS_ERR(pidfs_mnt
))
282 panic("Failed to mount pidfs pseudo filesystem");