From: Christian Brauner <brauner@kernel.org>
Date: Fri, 15 May 2026 15:41:05 +0000 (+0200)
Subject: Merge patch series "io_uring related epoll cleanups"
X-Git-Tag: v7.2-rc1~117^2~2^2
X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=6ece1a31c58c8c8293ecbbe79d7f92d52e1b0022;p=thirdparty%2Fkernel%2Flinux.git

Merge patch series "io_uring related epoll cleanups"

Jens Axboe <axboe@kernel.dk> says:

One of the nastier things about epoll is how it allows nesting contexts
inside each other, leading to the necessity of loop detection and the
issues that have come with that.

I don't believe there's any reason to support nesting on the io_uring
side; in fact, IORING_OP_EPOLL_CTL is a historical mistake, imho. But
let's at least try to contain the damage and disallow nested contexts
from our side.

Christian Brauner <brauner@kernel.org> says:

Bring in the eventpoll-specific io_uring changes together with the
eventpoll cleanup I did this cycle. The io_uring changes can go on top
of both through the block tree.

* patches from https://patch.msgid.link/20260514140817.623026-1-axboe@kernel.dk:
  eventpoll: rename struct epoll_filefd to epoll_key
  eventpoll: add file based control interface
  eventpoll: export is_file_epoll()
  eventpoll: pass struct epoll_filefd through ep_find() and ep_insert()

Link: https://patch.msgid.link/20260514140817.623026-1-axboe@kernel.dk
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---

6ece1a31c58c8c8293ecbbe79d7f92d52e1b0022
diff --cc fs/eventpoll.c
index b839cc02eb0ed,7535b10f8c6a6..a569e98d4a996
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@@ -265,15 -135,13 +260,15 @@@ struct epitem 
  	struct list_head rdllink;
  
  	/*
 -	 * Works together "struct eventpoll"->ovflist in keeping the
 -	 * single linked chain of items.
 +	 * Link on the owning eventpoll's scan-overflow list (ep->ovflist),
 +	 * EP_UNACTIVE_PTR when not linked. See epi_on_ovflist() /
 +	 * epi_clear_ovflist() and the "Ready-list state machine" section
 +	 * in the top-of-file banner.
  	 */
 -	struct epitem *next;
 +	struct epitem *ovflist_next;
  
  	/* The file descriptor information this item refers to */
- 	struct epoll_filefd ffd;
+ 	struct epoll_key ffd;
  
  	/* List containing poll wait queues */
  	struct eppoll_entry *pwqlist;
@@@ -526,7 -329,7 +521,7 @@@ static void __init epoll_sysctls_init(v
  
  static const struct file_operations eventpoll_fops;
  
- static inline bool is_file_epoll(struct file *f)
 -int is_file_epoll(struct file *f)
++bool is_file_epoll(struct file *f)
  {
  	return f->f_op == &eventpoll_fops;
  }
@@@ -1786,62 -1545,38 +1770,62 @@@ allocate
  }
  
  /*
 - * Must be called with "mtx" held.
 + * Charge the user's epoll_watches quota, allocate a fresh epitem for
-  * @tfile/@fd, and initialize its fields. The returned item is not yet
-  * linked into any data structure; the caller must install it via
++ * @tf, and initialize its fields. The returned item is not yet linked
++ * into any data structure; the caller must install it via
 + * ep_register_epitem() (which takes over on success) or kmem_cache_free()
 + * it and decrement epoll_watches on its own.
 + *
 + * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
 + * if the slab allocation fails.
   */
 -static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 -		     struct epoll_key *tf, int full_check)
 +static struct epitem *ep_alloc_epitem(struct eventpoll *ep,
 +				      const struct epoll_event *event,
- 				      struct file *tfile, int fd)
++				      struct epoll_key *tf)
  {
 -	int error, pwake = 0;
 -	__poll_t revents;
  	struct epitem *epi;
 -	struct ep_pqueue epq;
 -	struct eventpoll *tep = NULL;
 -
 -	if (is_file_epoll(tf->file))
 -		tep = tf->file->private_data;
 -
 -	lockdep_assert_irqs_enabled();
  
  	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
  					    max_user_watches) >= 0))
 -		return -ENOSPC;
 +		return ERR_PTR(-ENOSPC);
  	percpu_counter_inc(&ep->user->epoll_watches);
  
 -	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
 +	epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL);
 +	if (unlikely(!epi)) {
  		percpu_counter_dec(&ep->user->epoll_watches);
 -		return -ENOMEM;
 +		return ERR_PTR(-ENOMEM);
  	}
  
 -	/* Item initialization follow here ... */
  	INIT_LIST_HEAD(&epi->rdllink);
  	epi->ep = ep;
- 	ep_set_ffd(&epi->ffd, tfile, fd);
+ 	epi->ffd = *tf;
  	epi->event = *event;
 -	epi->next = EP_UNACTIVE_PTR;
 +	epi_clear_ovflist(epi);
 +
 +	return epi;
 +}
 +
 +/*
 + * Install @epi into its target file's f_ep hlist and into @ep's rbtree,
 + * taking one additional reference on @ep for the lifetime of the item.
 + *
 + * If @tep is non-NULL, the target file is itself an eventpoll; we hold
 + * tep->mtx at subclass 1 across the attach + rbtree insert to serialize
 + * with the target side. RB tree ops are protected by @ep->mtx, which
 + * the caller already holds.
 + *
 + * On failure the epi is freed and the epoll_watches counter decremented,
 + * matching ep_alloc_epitem()'s allocation. After this returns
 + * successfully, ep_insert()'s later error paths use ep_remove() for
 + * unwind; that cannot drop @ep's refcount to zero because the ep file
 + * itself still holds the original reference.
 + */
 +static int ep_register_epitem(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
 +			      struct epitem *epi, struct eventpoll *tep,
 +			      int full_check)
 +{
 +	struct file *tfile = epi->ffd.file;
 +	int error;
  
  	if (tep)
  		mutex_lock_nested(&tep->mtx, 1);
@@@ -1863,38 -1600,14 +1847,38 @@@
  	if (tep)
  		mutex_unlock(&tep->mtx);
  
 -	/*
 -	 * ep_remove() calls in the later error paths can't lead to
 -	 * ep_free() as the ep file itself still holds an ep reference.
 -	 */
  	ep_get(ep);
 +	return 0;
 +}
 +
 +/*
 + * Must be called with "mtx" held.
 + */
 +static int ep_insert(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
- 		     const struct epoll_event *event, struct file *tfile,
- 		     int fd, int full_check)
++		     const struct epoll_event *event, struct epoll_key *tf,
++		     int full_check)
 +{
 +	int error, pwake = 0;
 +	__poll_t revents;
 +	struct epitem *epi;
 +	struct ep_pqueue epq;
 +	struct eventpoll *tep = NULL;
 +
- 	if (is_file_epoll(tfile))
- 		tep = tfile->private_data;
++	if (is_file_epoll(tf->file))
++		tep = tf->file->private_data;
 +
 +	lockdep_assert_irqs_enabled();
 +
- 	epi = ep_alloc_epitem(ep, event, tfile, fd);
++	epi = ep_alloc_epitem(ep, event, tf);
 +	if (IS_ERR(epi))
 +		return PTR_ERR(epi);
 +
 +	error = ep_register_epitem(ctx, ep, epi, tep, full_check);
 +	if (error)
 +		return error;
  
 -	/* now check if we've created too many backpaths */
 -	if (unlikely(full_check && reverse_path_check())) {
 +	/* Reject the insert if the new link would create too many back-paths. */
 +	if (unlikely(full_check && reverse_path_check(ctx))) {
  		ep_remove(ep, epi);
  		return -EINVAL;
  	}
@@@ -2522,114 -2207,29 +2506,105 @@@ static inline void ep_take_care_of_epol
  }
  #endif
  
 -static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
 -				   bool nonblock)
 +static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock)
  {
  	if (!nonblock) {
 -		mutex_lock_nested(mutex, depth);
 +		mutex_lock(mutex);
  		return 0;
  	}
 -	if (mutex_trylock(mutex))
 +	return mutex_trylock(mutex) ? 0 : -EAGAIN;
 +}
 +
 +/*
 + * Acquire the locks required for do_epoll_ctl() on @ep for @op.
 + *
 + * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
 + * loop / path check under epnested_mutex when the topology can
 + * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
 + * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
 + * itself an eventpoll.
 + *
 + * Return value encodes both outcome and lock state:
 + *
 + *   0        success; ep->mtx held.
 + *   1        success; ep->mtx held AND the full check ran under
 + *            epnested_mutex (which is also still held). The value
 + *            doubles as the @full_check argument to ep_insert().
 + *   -errno   failure; no locks held.
 + *
 + * The caller releases what was taken with ep_ctl_unlock(ep, ret).
 + *
 + * Holding epnested_mutex on add is what prevents two racing
 + * EPOLL_CTL_ADDs on different eps from building a cycle without
 + * either walker observing it.
 + */
 +static int ep_ctl_lock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int op,
 +		       struct file *epfile, struct file *tfile, bool nonblock)
 +{
 +	struct eventpoll *tep;
 +	int error;
 +
 +	error = epoll_mutex_lock(&ep->mtx, nonblock);
 +	if (error)
 +		return error;
 +
 +	if (op != EPOLL_CTL_ADD)
  		return 0;
 -	return -EAGAIN;
 +	if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
 +	    !is_file_epoll(tfile))
 +		return 0;
 +
 +	/* Full check needed: drop ep->mtx so we can take epnested_mutex. */
 +	mutex_unlock(&ep->mtx);
 +	error = epoll_mutex_lock(&epnested_mutex, nonblock);
 +	if (error)
 +		return error;
 +
 +	loop_check_gen++;
 +
 +	if (is_file_epoll(tfile)) {
 +		tep = tfile->private_data;
 +		if (ep_loop_check(ctx, ep, tep) != 0) {
 +			error = -ELOOP;
 +			goto err_unlock_nested;
 +		}
 +	}
 +
 +	error = epoll_mutex_lock(&ep->mtx, nonblock);
 +	if (error)
 +		goto err_unlock_nested;
 +
 +	return 1;
 +
 +err_unlock_nested:
 +	clear_tfile_check_list(ctx);
 +	loop_check_gen++;
 +	mutex_unlock(&epnested_mutex);
 +	return error;
 +}
 +
 +static void ep_ctl_unlock(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
 +			  int full_check)
 +{
 +	mutex_unlock(&ep->mtx);
 +	if (full_check) {
 +		clear_tfile_check_list(ctx);
 +		loop_check_gen++;
 +		mutex_unlock(&epnested_mutex);
 +	}
  }
  
- int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
- 		 bool nonblock)
+ int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf,
+ 		      struct epoll_event *epds, bool nonblock)
  {
  	int error;
 -	int full_check = 0;
 +	int full_check;
  	struct eventpoll *ep;
  	struct epitem *epi;
 -	struct eventpoll *tep = NULL;
 +	struct ep_ctl_ctx ctx = { };
  
- 	CLASS(fd, f)(epfd);
- 	if (fd_empty(f))
- 		return -EBADF;
- 
- 	/* Get the "struct file *" for the target file */
- 	CLASS(fd, tf)(fd);
- 	if (fd_empty(tf))
- 		return -EBADF;
- 
  	/* The target file descriptor must support poll */
- 	if (!file_can_poll(fd_file(tf)))
+ 	if (!file_can_poll(tf->file))
  		return -EPERM;
  
  	/* Check if EPOLLWAKEUP is allowed */
@@@ -2637,11 -2237,13 +2612,11 @@@
  		ep_take_care_of_epollwakeup(epds);
  
  	/*
- 	 * The @epfd file must itself be an eventpoll, and we do not permit
 -	 * We have to check that the file structure underneath the file descriptor
 -	 * the user passed to us _is_ an eventpoll file. And also we do not permit
++	 * The @f file must itself be an eventpoll, and we do not permit
  	 * adding an epoll file descriptor inside itself.
  	 */
- 	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
 -	error = -EINVAL;
+ 	if (f == tf->file || !is_file_epoll(f))
 -		goto error_tgt_fput;
 +		return -EINVAL;
  
  	/*
  	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
@@@ -2650,32 -2252,70 +2625,30 @@@
  	 */
  	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
  		if (op == EPOLL_CTL_MOD)
 -			goto error_tgt_fput;
 +			return -EINVAL;
- 		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
+ 		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) ||
  				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
 -			goto error_tgt_fput;
 +			return -EINVAL;
  	}
  
- 	ep = fd_file(f)->private_data;
 -	/*
 -	 * At this point it is safe to assume that the "private_data" contains
 -	 * our own data structure.
 -	 */
+ 	ep = f->private_data;
  
- 	full_check = ep_ctl_lock(&ctx, ep, op, fd_file(f), fd_file(tf),
- 				 nonblock);
 -	/*
 -	 * When we insert an epoll file descriptor inside another epoll file
 -	 * descriptor, there is the chance of creating closed loops, which are
 -	 * better be handled here, than in more critical paths. While we are
 -	 * checking for loops we also determine the list of files reachable
 -	 * and hang them on the tfile_check_list, so we can check that we
 -	 * haven't created too many possible wakeup paths.
 -	 *
 -	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
 -	 * the epoll file descriptor is attaching directly to a wakeup source,
 -	 * unless the epoll file descriptor is nested. The purpose of taking the
 -	 * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
 -	 * deep wakeup paths from forming in parallel through multiple
 -	 * EPOLL_CTL_ADD operations.
 -	 */
 -	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
 -	if (error)
 -		goto error_tgt_fput;
 -	if (op == EPOLL_CTL_ADD) {
 -		if (READ_ONCE(f->f_ep) || ep->gen == loop_check_gen ||
 -		    is_file_epoll(tf->file)) {
 -			mutex_unlock(&ep->mtx);
 -			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
 -			if (error)
 -				goto error_tgt_fput;
 -			loop_check_gen++;
 -			full_check = 1;
 -			if (is_file_epoll(tf->file)) {
 -				tep = tf->file->private_data;
 -				error = -ELOOP;
 -				if (ep_loop_check(ep, tep) != 0)
 -					goto error_tgt_fput;
 -			}
 -			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
 -			if (error)
 -				goto error_tgt_fput;
 -		}
 -	}
++	full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock);
 +	if (full_check < 0)
 +		return full_check;
  
  	/*
 -	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
 -	 * above, we can be sure to be able to use the item looked up by
 -	 * ep_find() till we release the mutex.
 +	 * Look the target up in ep's RB tree. We hold ep->mtx, so the
 +	 * item stays valid until we release.
  	 */
- 	epi = ep_find(ep, fd_file(tf), fd);
+ 	epi = ep_find(ep, tf);
  
  	error = -EINVAL;
  	switch (op) {
  	case EPOLL_CTL_ADD:
  		if (!epi) {
  			epds->events |= EPOLLERR | EPOLLHUP;
- 			error = ep_insert(&ctx, ep, epds, fd_file(tf), fd,
- 					  full_check);
 -			error = ep_insert(ep, epds, tf, full_check);
++			error = ep_insert(&ctx, ep, epds, tf, full_check);
  		} else
  			error = -EEXIST;
  		break;
@@@ -2701,11 -2341,37 +2674,30 @@@
  			error = -ENOENT;
  		break;
  	}
 -	mutex_unlock(&ep->mtx);
  
 -error_tgt_fput:
 -	if (full_check) {
 -		clear_tfile_check_list();
 -		loop_check_gen++;
 -		mutex_unlock(&epnested_mutex);
 -	}
 +	ep_ctl_unlock(&ctx, ep, full_check);
  	return error;
 -
  }
  
+ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+ 		 bool nonblock)
+ {
+ 	struct epoll_key efd;
+ 
+ 	CLASS(fd, f)(epfd);
+ 	if (fd_empty(f))
+ 		return -EBADF;
+ 
+ 	/* Get the "struct file *" for the target file */
+ 	CLASS(fd, tf)(fd);
+ 	if (fd_empty(tf))
+ 		return -EBADF;
+ 
+ 	efd.file = fd_file(tf);
+ 	efd.fd = fd;
+ 	return do_epoll_ctl_file(fd_file(f), op, &efd, epds, nonblock);
+ }
+ 
  /*
   * The following function implements the controller interface for
   * the eventpoll file that enables the insertion/removal/change of
diff --cc include/linux/eventpoll.h
index 728fb5dee5ede,c214c374fefc2..de1c738aa8ad9
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@@ -61,8 -61,16 +61,16 @@@ static inline void eventpoll_release(st
  	eventpoll_release_file(file);
  }
  
+ struct epoll_key {
+ 	struct file *file;
+ 	int fd;
+ } __packed;
+ 
+ int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf,
+ 		      struct epoll_event *epds, bool nonblock);
  int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
  		 bool nonblock);
 -int is_file_epoll(struct file *f);
++bool is_file_epoll(struct file *f);
  
  /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
  static inline int ep_op_has_event(int op)