From: Christian Brauner <brauner@kernel.org>
Date: Fri, 24 Apr 2026 13:46:44 +0000 (+0200)
Subject: eventpoll: extract lock dance from do_epoll_ctl() into ep_ctl_lock()
X-Git-Tag: v7.2-rc1~224^2~4^2~4
X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=8d5278e05b7426eea986db62fa7de3c0cfaaef8f;p=thirdparty%2Fkernel%2Flinux.git

eventpoll: extract lock dance from do_epoll_ctl() into ep_ctl_lock()

do_epoll_ctl() interleaved three concerns in one body: input
validation, the ep->mtx + epnested_mutex acquisition dance for
EPOLL_CTL_ADD on potentially-nested topologies, and the op dispatch
with final unlock. The middle concern is the error-prone one; the
error_tgt_fput label existed mainly to orchestrate it.

Extract the acquisition as ep_ctl_lock() and the release as
ep_ctl_unlock(). ep_ctl_lock() always takes ep->mtx and, for
EPOLL_CTL_ADD on a topology that can change, additionally runs the
loop / path check under epnested_mutex. The return value is a
ternary:

   0        ep->mtx held.
   1        ep->mtx AND epnested_mutex held (full-check mode).
   -errno   failure, no locks held.

The non-negative value doubles as the @full_check argument to
ep_insert() and as the argument to ep_ctl_unlock(), so the caller
neither needs an out-parameter nor a separate boolean:

   full_check = ep_ctl_lock(ep, op, epfile, tfile, nonblock);
   if (full_check < 0)
       return full_check;
   ...
   ep_ctl_unlock(ep, full_check);

ep_ctl_unlock() drops ep->mtx and, if full_check == 1, clears
tfile_check_list, bumps loop_check_gen, and drops epnested_mutex --
mirroring the old error_tgt_fput block.

With that in place do_epoll_ctl()'s preconditions become direct
returns (no locks held, nothing to clean up), the acquisition is a
single call, the op dispatch is unchanged, and the epilogue is a
single ep_ctl_unlock() before return. The error_tgt_fput label goes
away.

The two loop_check_gen bumps (one at the start of the full check,
one after) are preserved inside ep_ctl_lock() / ep_ctl_unlock(),
keeping the invariant that ep->gen stamps left on per-eventpoll
caches never equal loop_check_gen after the check completes.

No functional change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
Link: https://patch.msgid.link/20260424-work-epoll-rework-v1-13-249ed00a20f3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 6d4167a347ab0..d49457dc8c7f9 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2479,14 +2479,92 @@ static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock)
 	return mutex_trylock(mutex) ? 0 : -EAGAIN;
 }
 
+/*
+ * Acquire the locks required for do_epoll_ctl() on @ep for @op.
+ *
+ * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
+ * loop / path check under epnested_mutex when the topology can
+ * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
+ * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
+ * itself an eventpoll.
+ *
+ * Return value encodes both outcome and lock state:
+ *
+ *   0        success; ep->mtx held.
+ *   1        success; ep->mtx held AND the full check ran under
+ *            epnested_mutex (which is also still held). The value
+ *            doubles as the @full_check argument to ep_insert().
+ *   -errno   failure; no locks held.
+ *
+ * The caller releases what was taken with ep_ctl_unlock(ep, ret).
+ *
+ * Holding epnested_mutex on add is what prevents two racing
+ * EPOLL_CTL_ADDs on different eps from building a cycle without
+ * either walker observing it.
+ */
+static int ep_ctl_lock(struct eventpoll *ep, int op,
+		       struct file *epfile, struct file *tfile,
+		       bool nonblock)
+{
+	struct eventpoll *tep;
+	int error;
+
+	error = epoll_mutex_lock(&ep->mtx, nonblock);
+	if (error)
+		return error;
+
+	if (op != EPOLL_CTL_ADD)
+		return 0;
+	if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
+	    !is_file_epoll(tfile))
+		return 0;
+
+	/* Full check needed: drop ep->mtx so we can take epnested_mutex. */
+	mutex_unlock(&ep->mtx);
+	error = epoll_mutex_lock(&epnested_mutex, nonblock);
+	if (error)
+		return error;
+
+	loop_check_gen++;
+
+	if (is_file_epoll(tfile)) {
+		tep = tfile->private_data;
+		if (ep_loop_check(ep, tep) != 0) {
+			error = -ELOOP;
+			goto err_unlock_nested;
+		}
+	}
+
+	error = epoll_mutex_lock(&ep->mtx, nonblock);
+	if (error)
+		goto err_unlock_nested;
+
+	return 1;
+
+err_unlock_nested:
+	clear_tfile_check_list();
+	loop_check_gen++;
+	mutex_unlock(&epnested_mutex);
+	return error;
+}
+
+static void ep_ctl_unlock(struct eventpoll *ep, int full_check)
+{
+	mutex_unlock(&ep->mtx);
+	if (full_check) {
+		clear_tfile_check_list();
+		loop_check_gen++;
+		mutex_unlock(&epnested_mutex);
+	}
+}
+
 int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 		 bool nonblock)
 {
 	int error;
-	int full_check = 0;
+	int full_check;
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct eventpoll *tep = NULL;
 
 	CLASS(fd, f)(epfd);
 	if (fd_empty(f))
@@ -2506,76 +2584,34 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 		ep_take_care_of_epollwakeup(epds);
 
 	/*
-	 * We have to check that the file structure underneath the file descriptor
-	 * the user passed to us _is_ an eventpoll file. And also we do not permit
+	 * The @epfd file must itself be an eventpoll, and we do not permit
 	 * adding an epoll file descriptor inside itself.
 	 */
-	error = -EINVAL;
 	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
-		goto error_tgt_fput;
+		return -EINVAL;
 
 	/*
 	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
 	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
-	 * Also, we do not currently supported nested exclusive wakeups.
+	 * Also, nested exclusive wakeups are not supported.
 	 */
 	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
 		if (op == EPOLL_CTL_MOD)
-			goto error_tgt_fput;
+			return -EINVAL;
 		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
 				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
-			goto error_tgt_fput;
+			return -EINVAL;
 	}
 
-	/*
-	 * At this point it is safe to assume that the "private_data" contains
-	 * our own data structure.
-	 */
 	ep = fd_file(f)->private_data;
 
-	/*
-	 * When we insert an epoll file descriptor inside another epoll file
-	 * descriptor, there is the chance of creating closed loops, which are
-	 * better be handled here, than in more critical paths. While we are
-	 * checking for loops we also determine the list of files reachable
-	 * and hang them on the tfile_check_list, so we can check that we
-	 * haven't created too many possible wakeup paths.
-	 *
-	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
-	 * the epoll file descriptor is attaching directly to a wakeup source,
-	 * unless the epoll file descriptor is nested. The purpose of taking the
-	 * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
-	 * deep wakeup paths from forming in parallel through multiple
-	 * EPOLL_CTL_ADD operations.
-	 */
-	error = epoll_mutex_lock(&ep->mtx, nonblock);
-	if (error)
-		goto error_tgt_fput;
-	if (op == EPOLL_CTL_ADD) {
-		if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
-		    is_file_epoll(fd_file(tf))) {
-			mutex_unlock(&ep->mtx);
-			error = epoll_mutex_lock(&epnested_mutex, nonblock);
-			if (error)
-				goto error_tgt_fput;
-			loop_check_gen++;
-			full_check = 1;
-			if (is_file_epoll(fd_file(tf))) {
-				tep = fd_file(tf)->private_data;
-				error = -ELOOP;
-				if (ep_loop_check(ep, tep) != 0)
-					goto error_tgt_fput;
-			}
-			error = epoll_mutex_lock(&ep->mtx, nonblock);
-			if (error)
-				goto error_tgt_fput;
-		}
-	}
+	full_check = ep_ctl_lock(ep, op, fd_file(f), fd_file(tf), nonblock);
+	if (full_check < 0)
+		return full_check;
 
 	/*
-	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
-	 * above, we can be sure to be able to use the item looked up by
-	 * ep_find() till we release the mutex.
+	 * Look the target up in ep's RB tree. We hold ep->mtx, so the
+	 * item stays valid until we release.
 	 */
 	epi = ep_find(ep, fd_file(tf), fd);
 
@@ -2610,14 +2646,8 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 			error = -ENOENT;
 		break;
 	}
-	mutex_unlock(&ep->mtx);
 
-error_tgt_fput:
-	if (full_check) {
-		clear_tfile_check_list();
-		loop_check_gen++;
-		mutex_unlock(&epnested_mutex);
-	}
+	ep_ctl_unlock(ep, full_check);
 	return error;
 }