Merge patch series "io_uring related epoll cleanups"

author Christian Brauner <brauner@kernel.org>

Fri, 15 May 2026 15:41:05 +0000 (17:41 +0200)

committer Christian Brauner <brauner@kernel.org>

Fri, 15 May 2026 15:41:05 +0000 (17:41 +0200)
author Christian Brauner <brauner@kernel.org>
Fri, 15 May 2026 15:41:05 +0000 (17:41 +0200)
committer Christian Brauner <brauner@kernel.org>
Fri, 15 May 2026 15:41:05 +0000 (17:41 +0200)
diff --cc fs/eventpoll.c

index b839cc02eb0eda085bb5d1c74eb153d51f1505c6,7535b10f8c6a60fcb6a5a62019c92d38d8dfb875..a569e98d4a9964622f663c38139f0fe8010456c7
--- 1/fs/eventpoll.c
--- 2/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@@ -265,15 -135,13 +260,15 @@@ struct epitem 
         struct list_head rdllink;
   
         /*
- -       * Works together "struct eventpoll"->ovflist in keeping the
- -       * single linked chain of items.
+ +       * Link on the owning eventpoll's scan-overflow list (ep->ovflist),
+ +       * EP_UNACTIVE_PTR when not linked. See epi_on_ovflist() /
+ +       * epi_clear_ovflist() and the "Ready-list state machine" section
+ +       * in the top-of-file banner.
          */
- -      struct epitem *next;
+ +      struct epitem *ovflist_next;
   
         /* The file descriptor information this item refers to */
-       struct epoll_filefd ffd;
+       struct epoll_key ffd;
   
         /* List containing poll wait queues */
         struct eppoll_entry *pwqlist;
@@@ -526,7 -329,7 +521,7 @@@ static void __init epoll_sysctls_init(v
   
   static const struct file_operations eventpoll_fops;
   
- static inline bool is_file_epoll(struct file *f)
- -int is_file_epoll(struct file *f)
++bool is_file_epoll(struct file *f)
   {
         return f->f_op == &eventpoll_fops;
   }
@@@ -1786,62 -1545,38 +1770,62 @@@ allocate
   }
   
   /*
- - * Must be called with "mtx" held.
+ + * Charge the user's epoll_watches quota, allocate a fresh epitem for
-  * @tfile/@fd, and initialize its fields. The returned item is not yet
-  * linked into any data structure; the caller must install it via
++ * @tf, and initialize its fields. The returned item is not yet linked
++ * into any data structure; the caller must install it via
+ + * ep_register_epitem() (which takes over on success) or kmem_cache_free()
+ + * it and decrement epoll_watches on its own.
+ + *
+ + * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
+ + * if the slab allocation fails.
    */
- -static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
- -                   struct epoll_key *tf, int full_check)
+ +static struct epitem *ep_alloc_epitem(struct eventpoll *ep,
+ +                                    const struct epoll_event *event,
-                                     struct file *tfile, int fd)
++                                    struct epoll_key *tf)
   {
- -      int error, pwake = 0;
- -      __poll_t revents;
         struct epitem *epi;
- -      struct ep_pqueue epq;
- -      struct eventpoll *tep = NULL;
- -
- -      if (is_file_epoll(tf->file))
- -              tep = tf->file->private_data;
- -
- -      lockdep_assert_irqs_enabled();
   
         if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
                                             max_user_watches) >= 0))
- -              return -ENOSPC;
+ +              return ERR_PTR(-ENOSPC);
         percpu_counter_inc(&ep->user->epoll_watches);
   
- -      if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+ +      epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL);
+ +      if (unlikely(!epi)) {
                 percpu_counter_dec(&ep->user->epoll_watches);
- -              return -ENOMEM;
+ +              return ERR_PTR(-ENOMEM);
         }
   
- -      /* Item initialization follow here ... */
         INIT_LIST_HEAD(&epi->rdllink);
         epi->ep = ep;
-       ep_set_ffd(&epi->ffd, tfile, fd);
+       epi->ffd = *tf;
         epi->event = *event;
- -      epi->next = EP_UNACTIVE_PTR;
+ +      epi_clear_ovflist(epi);
+ +
+ +      return epi;
+ +}
+ +
+ +/*
+ + * Install @epi into its target file's f_ep hlist and into @ep's rbtree,
+ + * taking one additional reference on @ep for the lifetime of the item.
+ + *
+ + * If @tep is non-NULL, the target file is itself an eventpoll; we hold
+ + * tep->mtx at subclass 1 across the attach + rbtree insert to serialize
+ + * with the target side. RB tree ops are protected by @ep->mtx, which
+ + * the caller already holds.
+ + *
+ + * On failure the epi is freed and the epoll_watches counter decremented,
+ + * matching ep_alloc_epitem()'s allocation. After this returns
+ + * successfully, ep_insert()'s later error paths use ep_remove() for
+ + * unwind; that cannot drop @ep's refcount to zero because the ep file
+ + * itself still holds the original reference.
+ + */
+ +static int ep_register_epitem(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ +                            struct epitem *epi, struct eventpoll *tep,
+ +                            int full_check)
+ +{
+ +      struct file *tfile = epi->ffd.file;
+ +      int error;
   
         if (tep)
                 mutex_lock_nested(&tep->mtx, 1);
@@@ -1863,38 -1600,14 +1847,38 @@@
         if (tep)
                 mutex_unlock(&tep->mtx);
   
- -      /*
- -       * ep_remove() calls in the later error paths can't lead to
- -       * ep_free() as the ep file itself still holds an ep reference.
- -       */
         ep_get(ep);
-                    const struct epoll_event *event, struct file *tfile,
-                    int fd, int full_check)
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Must be called with "mtx" held.
+ + */
+ +static int ep_insert(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
-       if (is_file_epoll(tfile))
-               tep = tfile->private_data;
++                   const struct epoll_event *event, struct epoll_key *tf,
++                   int full_check)
+ +{
+ +      int error, pwake = 0;
+ +      __poll_t revents;
+ +      struct epitem *epi;
+ +      struct ep_pqueue epq;
+ +      struct eventpoll *tep = NULL;
+ +
-       epi = ep_alloc_epitem(ep, event, tfile, fd);
++      if (is_file_epoll(tf->file))
++              tep = tf->file->private_data;
+ +
+ +      lockdep_assert_irqs_enabled();
+ +
++      epi = ep_alloc_epitem(ep, event, tf);
+ +      if (IS_ERR(epi))
+ +              return PTR_ERR(epi);
+ +
+ +      error = ep_register_epitem(ctx, ep, epi, tep, full_check);
+ +      if (error)
+ +              return error;
   
- -      /* now check if we've created too many backpaths */
- -      if (unlikely(full_check && reverse_path_check())) {
+ +      /* Reject the insert if the new link would create too many back-paths. */
+ +      if (unlikely(full_check && reverse_path_check(ctx))) {
                 ep_remove(ep, epi);
                 return -EINVAL;
         }
@@@ -2522,114 -2207,29 +2506,105 @@@ static inline void ep_take_care_of_epol
   }
   #endif
   
- -static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
- -                                 bool nonblock)
+ +static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock)
   {
         if (!nonblock) {
- -              mutex_lock_nested(mutex, depth);
+ +              mutex_lock(mutex);
                 return 0;
         }
- -      if (mutex_trylock(mutex))
+ +      return mutex_trylock(mutex) ? 0 : -EAGAIN;
+ +}
+ +
+ +/*
+ + * Acquire the locks required for do_epoll_ctl() on @ep for @op.
+ + *
+ + * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
+ + * loop / path check under epnested_mutex when the topology can
+ + * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
+ + * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
+ + * itself an eventpoll.
+ + *
+ + * Return value encodes both outcome and lock state:
+ + *
+ + *   0        success; ep->mtx held.
+ + *   1        success; ep->mtx held AND the full check ran under
+ + *            epnested_mutex (which is also still held). The value
+ + *            doubles as the @full_check argument to ep_insert().
+ + *   -errno   failure; no locks held.
+ + *
+ + * The caller releases what was taken with ep_ctl_unlock(ctx, ep, ret).
+ + *
+ + * Holding epnested_mutex on add is what prevents two racing
+ + * EPOLL_CTL_ADDs on different eps from building a cycle without
+ + * either walker observing it.
+ + */
+ +static int ep_ctl_lock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int op,
+ +                     struct file *epfile, struct file *tfile, bool nonblock)
+ +{
+ +      struct eventpoll *tep;
+ +      int error;
+ +
+ +      error = epoll_mutex_lock(&ep->mtx, nonblock);
+ +      if (error)
+ +              return error;
+ +
+ +      if (op != EPOLL_CTL_ADD)
                 return 0;
- -      return -EAGAIN;
+ +      if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
+ +          !is_file_epoll(tfile))
+ +              return 0;
+ +
+ +      /* Full check needed: drop ep->mtx so we can take epnested_mutex. */
+ +      mutex_unlock(&ep->mtx);
+ +      error = epoll_mutex_lock(&epnested_mutex, nonblock);
+ +      if (error)
+ +              return error;
+ +
+ +      loop_check_gen++;
+ +
+ +      if (is_file_epoll(tfile)) {
+ +              tep = tfile->private_data;
+ +              if (ep_loop_check(ctx, ep, tep) != 0) {
+ +                      error = -ELOOP;
+ +                      goto err_unlock_nested;
+ +              }
+ +      }
+ +
+ +      error = epoll_mutex_lock(&ep->mtx, nonblock);
+ +      if (error)
+ +              goto err_unlock_nested;
+ +
+ +      return 1;
+ +
+ +err_unlock_nested:
+ +      clear_tfile_check_list(ctx);
+ +      loop_check_gen++;
+ +      mutex_unlock(&epnested_mutex);
+ +      return error;
+ +}
+ +
+ +static void ep_ctl_unlock(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ +                        int full_check)
+ +{
+ +      mutex_unlock(&ep->mtx);
+ +      if (full_check) {
+ +              clear_tfile_check_list(ctx);
+ +              loop_check_gen++;
+ +              mutex_unlock(&epnested_mutex);
+ +      }
   }
   
- int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
-                bool nonblock)
+ int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf,
+                     struct epoll_event *epds, bool nonblock)
   {
         int error;
- -      int full_check = 0;
+ +      int full_check;
         struct eventpoll *ep;
         struct epitem *epi;
- -      struct eventpoll *tep = NULL;
+ +      struct ep_ctl_ctx ctx = { };
   
-       CLASS(fd, f)(epfd);
-       if (fd_empty(f))
-               return -EBADF;
- 
-       /* Get the "struct file *" for the target file */
-       CLASS(fd, tf)(fd);
-       if (fd_empty(tf))
-               return -EBADF;
- 
         /* The target file descriptor must support poll */
-       if (!file_can_poll(fd_file(tf)))
+       if (!file_can_poll(tf->file))
                 return -EPERM;
   
         /* Check if EPOLLWAKEUP is allowed */
@@@ -2637,11 -2237,13 +2612,11 @@@
                 ep_take_care_of_epollwakeup(epds);
   
         /*
-        * The @epfd file must itself be an eventpoll, and we do not permit
- -       * We have to check that the file structure underneath the file descriptor
- -       * the user passed to us _is_ an eventpoll file. And also we do not permit
++       * The @f file must itself be an eventpoll, and we do not permit
          * adding an epoll file descriptor inside itself.
          */
-       if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
- -      error = -EINVAL;
+       if (f == tf->file || !is_file_epoll(f))
- -              goto error_tgt_fput;
+ +              return -EINVAL;
   
         /*
          * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
@@@ -2650,32 -2252,70 +2625,30 @@@
          */
         if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
                 if (op == EPOLL_CTL_MOD)
- -                      goto error_tgt_fput;
+ +                      return -EINVAL;
-               if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
+               if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) ||
                                 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
- -                      goto error_tgt_fput;
+ +                      return -EINVAL;
         }
   
-       ep = fd_file(f)->private_data;
- -      /*
- -       * At this point it is safe to assume that the "private_data" contains
- -       * our own data structure.
- -       */
+       ep = f->private_data;
   
-       full_check = ep_ctl_lock(&ctx, ep, op, fd_file(f), fd_file(tf),
-                                nonblock);
- -      /*
- -       * When we insert an epoll file descriptor inside another epoll file
- -       * descriptor, there is the chance of creating closed loops, which are
- -       * better be handled here, than in more critical paths. While we are
- -       * checking for loops we also determine the list of files reachable
- -       * and hang them on the tfile_check_list, so we can check that we
- -       * haven't created too many possible wakeup paths.
- -       *
- -       * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
- -       * the epoll file descriptor is attaching directly to a wakeup source,
- -       * unless the epoll file descriptor is nested. The purpose of taking the
- -       * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
- -       * deep wakeup paths from forming in parallel through multiple
- -       * EPOLL_CTL_ADD operations.
- -       */
- -      error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
- -      if (error)
- -              goto error_tgt_fput;
- -      if (op == EPOLL_CTL_ADD) {
- -              if (READ_ONCE(f->f_ep) || ep->gen == loop_check_gen ||
- -                  is_file_epoll(tf->file)) {
- -                      mutex_unlock(&ep->mtx);
- -                      error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
- -                      if (error)
- -                              goto error_tgt_fput;
- -                      loop_check_gen++;
- -                      full_check = 1;
- -                      if (is_file_epoll(tf->file)) {
- -                              tep = tf->file->private_data;
- -                              error = -ELOOP;
- -                              if (ep_loop_check(ep, tep) != 0)
- -                                      goto error_tgt_fput;
- -                      }
- -                      error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
- -                      if (error)
- -                              goto error_tgt_fput;
- -              }
- -      }
++      full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock);
+ +      if (full_check < 0)
+ +              return full_check;
   
         /*
- -       * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
- -       * above, we can be sure to be able to use the item looked up by
- -       * ep_find() till we release the mutex.
+ +       * Look the target up in ep's RB tree. We hold ep->mtx, so the
+ +       * item stays valid until we release.
          */
-       epi = ep_find(ep, fd_file(tf), fd);
+       epi = ep_find(ep, tf);
   
         error = -EINVAL;
         switch (op) {
         case EPOLL_CTL_ADD:
                 if (!epi) {
                         epds->events |= EPOLLERR | EPOLLHUP;
-                       error = ep_insert(&ctx, ep, epds, fd_file(tf), fd,
-                                         full_check);
- -                      error = ep_insert(ep, epds, tf, full_check);
++                      error = ep_insert(&ctx, ep, epds, tf, full_check);
                 } else
                         error = -EEXIST;
                 break;
@@@ -2701,11 -2341,37 +2674,30 @@@
                         error = -ENOENT;
                 break;
         }
- -      mutex_unlock(&ep->mtx);
   
- -error_tgt_fput:
- -      if (full_check) {
- -              clear_tfile_check_list();
- -              loop_check_gen++;
- -              mutex_unlock(&epnested_mutex);
- -      }
+ +      ep_ctl_unlock(&ctx, ep, full_check);
         return error;
- -
   }
   
+ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+                bool nonblock)
+ {
+       struct epoll_key efd;
+ 
+       CLASS(fd, f)(epfd);
+       if (fd_empty(f))
+               return -EBADF;
+ 
+       /* Get the "struct file *" for the target file */
+       CLASS(fd, tf)(fd);
+       if (fd_empty(tf))
+               return -EBADF;
+ 
+       efd.file = fd_file(tf);
+       efd.fd = fd;
+       return do_epoll_ctl_file(fd_file(f), op, &efd, epds, nonblock);
+ }
+ 
   /*
    * The following function implements the controller interface for
    * the eventpoll file that enables the insertion/removal/change of
diff --cc include/linux/eventpoll.h

index 728fb5dee5edeb309ec404eb3e5da0aa07098728,c214c374fefc27ad00105dd17457e24148ed8a59..de1c738aa8ad9ee1b9408214be9c7304ab914a88
--- 1/include/linux/eventpoll.h
--- 2/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@@ -61,8 -61,16 +61,16 @@@ static inline void eventpoll_release(st
         eventpoll_release_file(file);
   }
   
+ struct epoll_key {
+       struct file *file;
+       int fd;
+ } __packed;
+ 
+ int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf,
+                     struct epoll_event *epds, bool nonblock);
   int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                  bool nonblock);
- -int is_file_epoll(struct file *f);
++bool is_file_epoll(struct file *f);
   
   /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
   static inline int ep_op_has_event(int op)
author	Christian Brauner <brauner@kernel.org>
	Fri, 15 May 2026 15:41:05 +0000 (17:41 +0200)
committer	Christian Brauner <brauner@kernel.org>
	Fri, 15 May 2026 15:41:05 +0000 (17:41 +0200)
		1	2
fs/eventpoll.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/eventpoll.h	patch \|	diff1 \|	diff2 \|	blob \| history