eventpoll: extract lock dance from do_epoll_ctl() into ep_ctl_lock()

author Christian Brauner <brauner@kernel.org>

Fri, 24 Apr 2026 13:46:44 +0000 (15:46 +0200)

committer Christian Brauner <brauner@kernel.org>

Tue, 28 Apr 2026 15:27:28 +0000 (17:27 +0200)
author Christian Brauner <brauner@kernel.org>
Fri, 24 Apr 2026 13:46:44 +0000 (15:46 +0200)
committer Christian Brauner <brauner@kernel.org>
Tue, 28 Apr 2026 15:27:28 +0000 (17:27 +0200)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c

index 6d4167a347ab05a8f26dacb21682cc21a862ec1c..d49457dc8c7f9b63df4e8c204cce9bd882463a21 100644 (file)
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2479,14 +2479,92 @@ static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock)
         return mutex_trylock(mutex) ? 0 : -EAGAIN;
  }
  
+/*
+ * Acquire the locks required for do_epoll_ctl() on @ep for @op.
+ *
+ * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
+ * loop / path check under epnested_mutex when the topology can
+ * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
+ * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
+ * itself an eventpoll.
+ *
+ * Return value encodes both outcome and lock state:
+ *
+ *   0        success; ep->mtx held.
+ *   1        success; ep->mtx held AND the full check ran under
+ *            epnested_mutex (which is also still held). The value
+ *            doubles as the @full_check argument to ep_insert().
+ *   -errno   failure; no locks held.
+ *
+ * The caller releases what was taken with ep_ctl_unlock(ep, ret).
+ *
+ * Holding epnested_mutex on add is what prevents two racing
+ * EPOLL_CTL_ADDs on different eps from building a cycle without
+ * either walker observing it.
+ */
+static int ep_ctl_lock(struct eventpoll *ep, int op,
+                      struct file *epfile, struct file *tfile,
+                      bool nonblock)
+{
+       struct eventpoll *tep;
+       int error;
+
+       error = epoll_mutex_lock(&ep->mtx, nonblock);
+       if (error)
+               return error;
+
+       if (op != EPOLL_CTL_ADD)
+               return 0;
+       if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
+           !is_file_epoll(tfile))
+               return 0;
+
+       /* Full check needed: drop ep->mtx so we can take epnested_mutex. */
+       mutex_unlock(&ep->mtx);
+       error = epoll_mutex_lock(&epnested_mutex, nonblock);
+       if (error)
+               return error;
+
+       loop_check_gen++;
+
+       if (is_file_epoll(tfile)) {
+               tep = tfile->private_data;
+               if (ep_loop_check(ep, tep) != 0) {
+                       error = -ELOOP;
+                       goto err_unlock_nested;
+               }
+       }
+
+       error = epoll_mutex_lock(&ep->mtx, nonblock);
+       if (error)
+               goto err_unlock_nested;
+
+       return 1;
+
+err_unlock_nested:
+       clear_tfile_check_list();
+       loop_check_gen++;
+       mutex_unlock(&epnested_mutex);
+       return error;
+}
+
+static void ep_ctl_unlock(struct eventpoll *ep, int full_check)
+{
+       mutex_unlock(&ep->mtx);
+       if (full_check) {
+               clear_tfile_check_list();
+               loop_check_gen++;
+               mutex_unlock(&epnested_mutex);
+       }
+}
+
  int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                  bool nonblock)
  {
         int error;
-       int full_check = 0;
+       int full_check;
         struct eventpoll *ep;
         struct epitem *epi;
-       struct eventpoll *tep = NULL;
  
         CLASS(fd, f)(epfd);
         if (fd_empty(f))
@@ -2506,76 +2584,34 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 ep_take_care_of_epollwakeup(epds);
  
         /*
-        * We have to check that the file structure underneath the file descriptor
-        * the user passed to us _is_ an eventpoll file. And also we do not permit
+        * The @epfd file must itself be an eventpoll, and we do not permit
          * adding an epoll file descriptor inside itself.
          */
-       error = -EINVAL;
         if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
-               goto error_tgt_fput;
+               return -EINVAL;
  
         /*
          * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
          * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
-        * Also, we do not currently supported nested exclusive wakeups.
+        * Also, nested exclusive wakeups are not supported.
          */
         if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
                 if (op == EPOLL_CTL_MOD)
-                       goto error_tgt_fput;
+                       return -EINVAL;
                 if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
                                 (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
-                       goto error_tgt_fput;
+                       return -EINVAL;
         }
  
-       /*
-        * At this point it is safe to assume that the "private_data" contains
-        * our own data structure.
-        */
         ep = fd_file(f)->private_data;
  
-       /*
-        * When we insert an epoll file descriptor inside another epoll file
-        * descriptor, there is the chance of creating closed loops, which are
-        * better be handled here, than in more critical paths. While we are
-        * checking for loops we also determine the list of files reachable
-        * and hang them on the tfile_check_list, so we can check that we
-        * haven't created too many possible wakeup paths.
-        *
-        * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
-        * the epoll file descriptor is attaching directly to a wakeup source,
-        * unless the epoll file descriptor is nested. The purpose of taking the
-        * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
-        * deep wakeup paths from forming in parallel through multiple
-        * EPOLL_CTL_ADD operations.
-        */
-       error = epoll_mutex_lock(&ep->mtx, nonblock);
-       if (error)
-               goto error_tgt_fput;
-       if (op == EPOLL_CTL_ADD) {
-               if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
-                   is_file_epoll(fd_file(tf))) {
-                       mutex_unlock(&ep->mtx);
-                       error = epoll_mutex_lock(&epnested_mutex, nonblock);
-                       if (error)
-                               goto error_tgt_fput;
-                       loop_check_gen++;
-                       full_check = 1;
-                       if (is_file_epoll(fd_file(tf))) {
-                               tep = fd_file(tf)->private_data;
-                               error = -ELOOP;
-                               if (ep_loop_check(ep, tep) != 0)
-                                       goto error_tgt_fput;
-                       }
-                       error = epoll_mutex_lock(&ep->mtx, nonblock);
-                       if (error)
-                               goto error_tgt_fput;
-               }
-       }
+       full_check = ep_ctl_lock(ep, op, fd_file(f), fd_file(tf), nonblock);
+       if (full_check < 0)
+               return full_check;
  
         /*
-        * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
-        * above, we can be sure to be able to use the item looked up by
-        * ep_find() till we release the mutex.
+        * Look the target up in ep's RB tree. We hold ep->mtx, so the
+        * item stays valid until we release.
          */
         epi = ep_find(ep, fd_file(tf), fd);
  
@@ -2610,14 +2646,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                         error = -ENOENT;
                 break;
         }
-       mutex_unlock(&ep->mtx);
  
-error_tgt_fput:
-       if (full_check) {
-               clear_tfile_check_list();
-               loop_check_gen++;
-               mutex_unlock(&epnested_mutex);
-       }
+       ep_ctl_unlock(ep, full_check);
         return error;
  }
author	Christian Brauner <brauner@kernel.org>
	Fri, 24 Apr 2026 13:46:44 +0000 (15:46 +0200)
committer	Christian Brauner <brauner@kernel.org>
	Tue, 28 Apr 2026 15:27:28 +0000 (17:27 +0200)