struct list_head rdllink;
/*
- * Works together "struct eventpoll"->ovflist in keeping the
- * single linked chain of items.
+ * Link on the owning eventpoll's scan-overflow list (ep->ovflist),
+ * EP_UNACTIVE_PTR when not linked. See epi_on_ovflist() /
+ * epi_clear_ovflist() and the "Ready-list state machine" section
+ * in the top-of-file banner.
*/
- struct epitem *next;
+ struct epitem *ovflist_next;
/* The file descriptor information this item refers to */
- struct epoll_filefd ffd;
+ struct epoll_key ffd;
/* List containing poll wait queues */
struct eppoll_entry *pwqlist;
static const struct file_operations eventpoll_fops;
- static inline bool is_file_epoll(struct file *f)
-int is_file_epoll(struct file *f)
++bool is_file_epoll(struct file *f)
{
return f->f_op == &eventpoll_fops;
}
}
/*
- * Must be called with "mtx" held.
+ * Charge the user's epoll_watches quota, allocate a fresh epitem for
- * @tfile/@fd, and initialize its fields. The returned item is not yet
- * linked into any data structure; the caller must install it via
++ * @tf, and initialize its fields. The returned item is not yet linked
++ * into any data structure; the caller must install it via
+ * ep_register_epitem() (which owns it from then on, even on failure) or
+ * kmem_cache_free() it and decrement epoll_watches on its own.
+ *
+ * Returns ERR_PTR(-ENOSPC) if the quota is exceeded, ERR_PTR(-ENOMEM)
+ * if the slab allocation fails.
*/
-static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
- struct epoll_key *tf, int full_check)
+static struct epitem *ep_alloc_epitem(struct eventpoll *ep,
+ const struct epoll_event *event,
- struct file *tfile, int fd)
++ struct epoll_key *tf)
{
- int error, pwake = 0;
- __poll_t revents;
struct epitem *epi;
- struct ep_pqueue epq;
- struct eventpoll *tep = NULL;
-
- if (is_file_epoll(tf->file))
- tep = tf->file->private_data;
-
- lockdep_assert_irqs_enabled();
if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
max_user_watches) >= 0))
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
percpu_counter_inc(&ep->user->epoll_watches);
- if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+ epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL);
+ if (unlikely(!epi)) {
percpu_counter_dec(&ep->user->epoll_watches);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
- /* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
epi->ep = ep;
- ep_set_ffd(&epi->ffd, tfile, fd);
+ epi->ffd = *tf;
epi->event = *event;
- epi->next = EP_UNACTIVE_PTR;
+ epi_clear_ovflist(epi);
+
+ return epi;
+}
+
+/*
+ * Install @epi into its target file's f_ep hlist and into @ep's rbtree,
+ * taking one additional reference on @ep for the lifetime of the item.
+ *
+ * If @tep is non-NULL, the target file is itself an eventpoll; we hold
+ * tep->mtx at subclass 1 across the attach + rbtree insert to serialize
+ * with the target side. RB tree ops are protected by @ep->mtx, which
+ * the caller already holds.
+ *
+ * On failure the epi is freed and the epoll_watches counter decremented,
+ * matching ep_alloc_epitem()'s allocation. After this returns
+ * successfully, ep_insert()'s later error paths use ep_remove() for
+ * unwind; that cannot drop @ep's refcount to zero because the ep file
+ * itself still holds the original reference.
+ */
+static int ep_register_epitem(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ struct epitem *epi, struct eventpoll *tep,
+ int full_check)
+{
+ struct file *tfile = epi->ffd.file;
+ int error;
if (tep)
mutex_lock_nested(&tep->mtx, 1);
if (tep)
mutex_unlock(&tep->mtx);
- /*
- * ep_remove() calls in the later error paths can't lead to
- * ep_free() as the ep file itself still holds an ep reference.
- */
ep_get(ep);
- const struct epoll_event *event, struct file *tfile,
- int fd, int full_check)
+ return 0;
+}
+
+/*
+ * Must be called with "mtx" held.
+ */
+static int ep_insert(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
- if (is_file_epoll(tfile))
- tep = tfile->private_data;
++ const struct epoll_event *event, struct epoll_key *tf,
++ int full_check)
+{
+ int error, pwake = 0;
+ __poll_t revents;
+ struct epitem *epi;
+ struct ep_pqueue epq;
+ struct eventpoll *tep = NULL;
+
- epi = ep_alloc_epitem(ep, event, tfile, fd);
++ if (is_file_epoll(tf->file))
++ tep = tf->file->private_data;
+
+ lockdep_assert_irqs_enabled();
+
++ epi = ep_alloc_epitem(ep, event, tf);
+ if (IS_ERR(epi))
+ return PTR_ERR(epi);
+
+ error = ep_register_epitem(ctx, ep, epi, tep, full_check);
+ if (error)
+ return error;
- /* now check if we've created too many backpaths */
- if (unlikely(full_check && reverse_path_check())) {
+ /* Reject the insert if the new link would create too many back-paths. */
+ if (unlikely(full_check && reverse_path_check(ctx))) {
ep_remove(ep, epi);
return -EINVAL;
}
}
#endif
-static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
- bool nonblock)
+static inline int epoll_mutex_lock(struct mutex *mutex, bool nonblock)
{
if (!nonblock) {
- mutex_lock_nested(mutex, depth);
+ mutex_lock(mutex);
return 0;
}
- if (mutex_trylock(mutex))
+ return mutex_trylock(mutex) ? 0 : -EAGAIN;
+}
+
+/*
+ * Acquire the locks required for do_epoll_ctl() on @ep for @op.
+ *
+ * Always takes ep->mtx. For EPOLL_CTL_ADD, additionally runs the
+ * loop / path check under epnested_mutex when the topology can
+ * change: @ep is already watched (epfile->f_ep non-NULL), @ep was
+ * recently loop-checked (ep->gen == loop_check_gen), or @tfile is
+ * itself an eventpoll.
+ *
+ * Return value encodes both outcome and lock state:
+ *
+ * 0 success; ep->mtx held.
+ * 1 success; ep->mtx held AND the full check ran under
+ * epnested_mutex (which is also still held). The value
+ * doubles as the @full_check argument to ep_insert().
+ * -errno failure; no locks held.
+ *
+ * The caller releases what was taken with ep_ctl_unlock(ctx, ep, ret).
+ *
+ * Holding epnested_mutex on add is what prevents two racing
+ * EPOLL_CTL_ADDs on different eps from building a cycle without
+ * either walker observing it.
+ */
+static int ep_ctl_lock(struct ep_ctl_ctx *ctx, struct eventpoll *ep, int op,
+ struct file *epfile, struct file *tfile, bool nonblock)
+{
+ struct eventpoll *tep;
+ int error;
+
+ error = epoll_mutex_lock(&ep->mtx, nonblock);
+ if (error)
+ return error;
+
+ if (op != EPOLL_CTL_ADD)
return 0;
- return -EAGAIN;
+ if (!READ_ONCE(epfile->f_ep) && ep->gen != loop_check_gen &&
+ !is_file_epoll(tfile))
+ return 0;
+
+ /* Full check needed: drop ep->mtx so we can take epnested_mutex. */
+ mutex_unlock(&ep->mtx);
+ error = epoll_mutex_lock(&epnested_mutex, nonblock);
+ if (error)
+ return error;
+
+ loop_check_gen++;
+
+ if (is_file_epoll(tfile)) {
+ tep = tfile->private_data;
+ if (ep_loop_check(ctx, ep, tep) != 0) {
+ error = -ELOOP;
+ goto err_unlock_nested;
+ }
+ }
+
+ error = epoll_mutex_lock(&ep->mtx, nonblock);
+ if (error)
+ goto err_unlock_nested;
+
+ return 1;
+
+err_unlock_nested:
+ clear_tfile_check_list(ctx);
+ loop_check_gen++;
+ mutex_unlock(&epnested_mutex);
+ return error;
+}
+
+static void ep_ctl_unlock(struct ep_ctl_ctx *ctx, struct eventpoll *ep,
+ int full_check)
+{
+ mutex_unlock(&ep->mtx);
+ if (full_check) {
+ clear_tfile_check_list(ctx);
+ loop_check_gen++;
+ mutex_unlock(&epnested_mutex);
+ }
}
- int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
- bool nonblock)
+ int do_epoll_ctl_file(struct file *f, int op, struct epoll_key *tf,
+ struct epoll_event *epds, bool nonblock)
{
int error;
- int full_check = 0;
+ int full_check;
struct eventpoll *ep;
struct epitem *epi;
- struct eventpoll *tep = NULL;
+ struct ep_ctl_ctx ctx = { };
- CLASS(fd, f)(epfd);
- if (fd_empty(f))
- return -EBADF;
-
- /* Get the "struct file *" for the target file */
- CLASS(fd, tf)(fd);
- if (fd_empty(tf))
- return -EBADF;
-
/* The target file descriptor must support poll */
- if (!file_can_poll(fd_file(tf)))
+ if (!file_can_poll(tf->file))
return -EPERM;
/* Check if EPOLLWAKEUP is allowed */
ep_take_care_of_epollwakeup(epds);
/*
- * The @epfd file must itself be an eventpoll, and we do not permit
- * We have to check that the file structure underneath the file descriptor
- * the user passed to us _is_ an eventpoll file. And also we do not permit
++ * The @f file must itself be an eventpoll, and we do not permit
* adding an epoll file descriptor inside itself.
*/
- if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
- error = -EINVAL;
+ if (f == tf->file || !is_file_epoll(f))
- goto error_tgt_fput;
+ return -EINVAL;
/*
* epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
*/
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
- goto error_tgt_fput;
+ return -EINVAL;
- if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(tf->file) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
- goto error_tgt_fput;
+ return -EINVAL;
}
- ep = fd_file(f)->private_data;
- /*
- * At this point it is safe to assume that the "private_data" contains
- * our own data structure.
- */
+ ep = f->private_data;
- full_check = ep_ctl_lock(&ctx, ep, op, fd_file(f), fd_file(tf),
- nonblock);
- /*
- * When we insert an epoll file descriptor inside another epoll file
- * descriptor, there is the chance of creating closed loops, which are
- * better be handled here, than in more critical paths. While we are
- * checking for loops we also determine the list of files reachable
- * and hang them on the tfile_check_list, so we can check that we
- * haven't created too many possible wakeup paths.
- *
- * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
- * the epoll file descriptor is attaching directly to a wakeup source,
- * unless the epoll file descriptor is nested. The purpose of taking the
- * 'epnested_mutex' on add is to prevent complex toplogies such as loops and
- * deep wakeup paths from forming in parallel through multiple
- * EPOLL_CTL_ADD operations.
- */
- error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
- if (error)
- goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD) {
- if (READ_ONCE(f->f_ep) || ep->gen == loop_check_gen ||
- is_file_epoll(tf->file)) {
- mutex_unlock(&ep->mtx);
- error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
- if (error)
- goto error_tgt_fput;
- loop_check_gen++;
- full_check = 1;
- if (is_file_epoll(tf->file)) {
- tep = tf->file->private_data;
- error = -ELOOP;
- if (ep_loop_check(ep, tep) != 0)
- goto error_tgt_fput;
- }
- error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
- if (error)
- goto error_tgt_fput;
- }
- }
++ full_check = ep_ctl_lock(&ctx, ep, op, f, tf->file, nonblock);
+ if (full_check < 0)
+ return full_check;
/*
- * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
- * above, we can be sure to be able to use the item looked up by
- * ep_find() till we release the mutex.
+ * Look the target up in ep's RB tree. We hold ep->mtx, so the
+ * item stays valid until we release.
*/
- epi = ep_find(ep, fd_file(tf), fd);
+ epi = ep_find(ep, tf);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(&ctx, ep, epds, fd_file(tf), fd,
- full_check);
- error = ep_insert(ep, epds, tf, full_check);
++ error = ep_insert(&ctx, ep, epds, tf, full_check);
} else
error = -EEXIST;
break;
error = -ENOENT;
break;
}
- mutex_unlock(&ep->mtx);
-error_tgt_fput:
- if (full_check) {
- clear_tfile_check_list();
- loop_check_gen++;
- mutex_unlock(&epnested_mutex);
- }
+ ep_ctl_unlock(&ctx, ep, full_check);
return error;
-
}
+ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+ bool nonblock)
+ {
+ struct epoll_key efd;
+
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ /* Get the "struct file *" for the target file */
+ CLASS(fd, tf)(fd);
+ if (fd_empty(tf))
+ return -EBADF;
+
+ efd.file = fd_file(tf);
+ efd.fd = fd;
+ return do_epoll_ctl_file(fd_file(f), op, &efd, epds, nonblock);
+ }
+
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of