--- /dev/null
+From 971316f0503a5c50633d07b83b6db2f15a3a5b00 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Fri, 24 Feb 2012 20:07:29 +0100
+Subject: epoll: ep_unregister_pollwait() can use the freed pwq->whead
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 971316f0503a5c50633d07b83b6db2f15a3a5b00 upstream.
+
+signalfd_cleanup() ensures that ->signalfd_wqh is not used, but
+this is not enough. eppoll_entry->whead still points to the memory
+we are going to free, so ep_unregister_pollwait()->remove_wait_queue()
+is obviously unsafe.
+
+Change ep_poll_callback(POLLFREE) to set eppoll_entry->whead = NULL,
+and change ep_unregister_pollwait() to check pwq->whead != NULL under
+rcu_read_lock() before calling remove_wait_queue(). We add a new
+helper, ep_remove_wait_queue(), for this.
+
+This works because sighand_cachep is SLAB_DESTROY_BY_RCU and because
+->signalfd_wqh is initialized in sighand_ctor(), not in copy_sighand().
+ep_unregister_pollwait()->remove_wait_queue() can play with an already
+freed and potentially reused ->sighand, but this is fine: that memory
+must still hold a valid ->signalfd_wqh until rcu_read_unlock().
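+
+For illustration, a simplified userspace sketch (hypothetical, not the
+reported reproducer) of the kind of sequence this series guards
+against: the task that did EPOLL_CTL_ADD exits and frees the ->sighand
+its signalfd wait entry lives on, while another process still holds a
+reference to the eventpoll file.
+
+	#include <sys/epoll.h>
+	#include <sys/signalfd.h>
+	#include <signal.h>
+	#include <stdlib.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		sigset_t mask;
+		int sfd, epfd;
+		struct epoll_event ev = { .events = EPOLLIN };
+
+		sigemptyset(&mask);
+		sigaddset(&mask, SIGUSR1);
+		sigprocmask(SIG_BLOCK, &mask, NULL);
+
+		sfd = signalfd(-1, &mask, 0);
+		epfd = epoll_create1(0);
+		/* the wait entry goes on *this* task's ->sighand->signalfd_wqh */
+		epoll_ctl(epfd, EPOLL_CTL_ADD, sfd, &ev);
+
+		if (fork() == 0) {
+			/* child shares the eventpoll file but not the sighand */
+			sleep(1);	/* let the parent exit and free its sighand */
+			close(epfd);	/* ep_unregister_pollwait() runs here */
+			exit(0);
+		}
+		return 0;	/* parent exits, its sighand_struct is freed */
+	}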
+
+Reported-by: Maxime Bizon <mbizon@freebox.fr>
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/eventpoll.c | 30 +++++++++++++++++++++++++++---
+ fs/signalfd.c | 6 +++++-
+ 2 files changed, 32 insertions(+), 4 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -299,6 +299,11 @@ static inline int ep_is_linked(struct li
+ return !list_empty(p);
+ }
+
++static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
++{
++ return container_of(p, struct eppoll_entry, wait);
++}
++
+ /* Get the "struct epitem" from a wait queue pointer */
+ static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
+ {
+@@ -446,6 +451,18 @@ static void ep_poll_safewake(wait_queue_
+ put_cpu();
+ }
+
++static void ep_remove_wait_queue(struct eppoll_entry *pwq)
++{
++ wait_queue_head_t *whead;
++
++ rcu_read_lock();
++ /* If it is cleared by POLLFREE, it should be rcu-safe */
++ whead = rcu_dereference(pwq->whead);
++ if (whead)
++ remove_wait_queue(whead, &pwq->wait);
++ rcu_read_unlock();
++}
++
+ /*
+ * This function unregisters poll callbacks from the associated file
+ * descriptor. Must be called with "mtx" held (or "epmutex" if called from
+@@ -460,7 +477,7 @@ static void ep_unregister_pollwait(struc
+ pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
+
+ list_del(&pwq->llink);
+- remove_wait_queue(pwq->whead, &pwq->wait);
++ ep_remove_wait_queue(pwq);
+ kmem_cache_free(pwq_cache, pwq);
+ }
+ }
+@@ -827,9 +844,16 @@ static int ep_poll_callback(wait_queue_t
+ struct epitem *epi = ep_item_from_wait(wait);
+ struct eventpoll *ep = epi->ep;
+
+- /* the caller holds eppoll_entry->whead->lock */
+- if ((unsigned long)key & POLLFREE)
++ if ((unsigned long)key & POLLFREE) {
++ ep_pwq_from_wait(wait)->whead = NULL;
++ /*
++ * whead = NULL above can race with ep_remove_wait_queue()
++ * which can do another remove_wait_queue() after us, so we
++ * can't use __remove_wait_queue(). whead->lock is held by
++ * the caller.
++ */
+ list_del_init(&wait->task_list);
++ }
+
+ spin_lock_irqsave(&ep->lock, flags);
+
+--- a/fs/signalfd.c
++++ b/fs/signalfd.c
+@@ -33,7 +33,11 @@
+ void signalfd_cleanup(struct sighand_struct *sighand)
+ {
+ wait_queue_head_t *wqh = &sighand->signalfd_wqh;
+-
++ /*
++ * The lockless check can race with remove_wait_queue() in progress,
++ * but in this case its caller should run under rcu_read_lock() and
++ * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
++ */
+ if (likely(!waitqueue_active(wqh)))
+ return;
+
--- /dev/null
+From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Fri, 24 Feb 2012 20:07:11 +0100
+Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree()
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit d80e731ecab420ddcb79ee9d0ac427acbc187b4b upstream.
+
+This patch is intentionally incomplete to simplify the review.
+It ignores ep_unregister_pollwait(), which plays with the same wqh.
+See the next change.
+
+epoll assumes that the EPOLL_CTL_ADD'ed file controls everything
+f_op->poll() needs. In particular it assumes that the wait queue
+can't go away until eventpoll_release(). This is not true in the
+case of signalfd: the task which does EPOLL_CTL_ADD uses its own
+->sighand, which is not connected to the file.
+
+This patch adds the special event, POLLFREE, currently only for
+epoll. It expects that the init_poll_funcptr()'ed hook does the
+necessary cleanup. Perhaps it should be defined as EPOLLFREE in
+eventpoll.
+
+__cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if
+->signalfd_wqh is not empty; we add the new signalfd_cleanup()
+helper for this.
+
+ep_poll_callback(POLLFREE) simply does list_del_init(task_list).
+This makes the poll entry inconsistent, but we don't care. If you
+share an epoll fd which contains our sigfd with another process,
+you should blame yourself. signalfd is "really special". I simply
+do not know how we can define the "right" semantics if it is used
+with epoll.
+
+The main problem is that epoll calls signalfd_poll() once to
+establish the connection with the wait queue; after that,
+signalfd_poll(NULL) returns different/inconsistent results depending
+on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from the
+sigmask, signalfd has nothing to do with the file; it works with
+the current thread.
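+
+Roughly, an abridged sketch of the poll hook (not the exact
+fs/signalfd.c source; details may differ): everything is keyed off
+the calling task's ->sighand, so the identity of the caller, not the
+file, determines both the wait queue used and the result.
+
+	static unsigned int signalfd_poll(struct file *file, poll_table *wait)
+	{
+		struct signalfd_ctx *ctx = file->private_data;
+		unsigned int events = 0;
+
+		/* the wait entry is queued on the *caller's* sighand, not the file */
+		poll_wait(file, &current->sighand->signalfd_wqh, wait);
+
+		spin_lock_irq(&current->sighand->siglock);
+		if (next_signal(&current->pending, &ctx->sigmask) ||
+		    next_signal(&current->signal->shared_pending, &ctx->sigmask))
+			events |= POLLIN;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		return events;
+	}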
+
+In short: this patch is a hack which tries to fix the symptoms. It
+also assumes that nobody can take tasklist_lock under epoll locks,
+which seems to be true.
+
+Note:
+
+ - we do not have wake_up_all_poll() but wake_up_poll()
+ is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE.
+
+ - signalfd_cleanup() uses POLLHUP along with POLLFREE,
+ we need a couple of simple changes in eventpoll.c to
+ make sure it can't be "lost".
+
+Reported-by: Maxime Bizon <mbizon@freebox.fr>
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/eventpoll.c | 4 ++++
+ fs/signalfd.c | 11 +++++++++++
+ include/asm-generic/poll.h | 2 ++
+ include/linux/signalfd.h | 5 ++++-
+ kernel/fork.c | 5 ++++-
+ 5 files changed, 25 insertions(+), 2 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -827,6 +827,10 @@ static int ep_poll_callback(wait_queue_t
+ struct epitem *epi = ep_item_from_wait(wait);
+ struct eventpoll *ep = epi->ep;
+
++ /* the caller holds eppoll_entry->whead->lock */
++ if ((unsigned long)key & POLLFREE)
++ list_del_init(&wait->task_list);
++
+ spin_lock_irqsave(&ep->lock, flags);
+
+ /*
+--- a/fs/signalfd.c
++++ b/fs/signalfd.c
+@@ -30,6 +30,17 @@
+ #include <linux/signalfd.h>
+ #include <linux/syscalls.h>
+
++void signalfd_cleanup(struct sighand_struct *sighand)
++{
++ wait_queue_head_t *wqh = &sighand->signalfd_wqh;
++
++ if (likely(!waitqueue_active(wqh)))
++ return;
++
++ /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
++ wake_up_poll(wqh, POLLHUP | POLLFREE);
++}
++
+ struct signalfd_ctx {
+ sigset_t sigmask;
+ };
+--- a/include/asm-generic/poll.h
++++ b/include/asm-generic/poll.h
+@@ -28,6 +28,8 @@
+ #define POLLRDHUP 0x2000
+ #endif
+
++#define POLLFREE 0x4000 /* currently only for epoll */
++
+ struct pollfd {
+ int fd;
+ short events;
+--- a/include/linux/signalfd.h
++++ b/include/linux/signalfd.h
+@@ -61,13 +61,16 @@ static inline void signalfd_notify(struc
+ wake_up(&tsk->sighand->signalfd_wqh);
+ }
+
++extern void signalfd_cleanup(struct sighand_struct *sighand);
++
+ #else /* CONFIG_SIGNALFD */
+
+ static inline void signalfd_notify(struct task_struct *tsk, int sig) { }
+
++static inline void signalfd_cleanup(struct sighand_struct *sighand) { }
++
+ #endif /* CONFIG_SIGNALFD */
+
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_SIGNALFD_H */
+-
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -67,6 +67,7 @@
+ #include <linux/user-return-notifier.h>
+ #include <linux/oom.h>
+ #include <linux/khugepaged.h>
++#include <linux/signalfd.h>
+
+ #include <asm/pgtable.h>
+ #include <asm/pgalloc.h>
+@@ -917,8 +918,10 @@ static int copy_sighand(unsigned long cl
+
+ void __cleanup_sighand(struct sighand_struct *sighand)
+ {
+- if (atomic_dec_and_test(&sighand->count))
++ if (atomic_dec_and_test(&sighand->count)) {
++ signalfd_cleanup(sighand);
+ kmem_cache_free(sighand_cachep, sighand);
++ }
+ }
+
+
--- /dev/null
+From 28d82dc1c4edbc352129f97f4ca22624d1fe61de Mon Sep 17 00:00:00 2001
+From: Jason Baron <jbaron@redhat.com>
+Date: Thu, 12 Jan 2012 17:17:43 -0800
+Subject: epoll: limit paths
+
+From: Jason Baron <jbaron@redhat.com>
+
+commit 28d82dc1c4edbc352129f97f4ca22624d1fe61de upstream.
+
+The current epoll code can be tickled to run basically indefinitely,
+both in the loop detection path check (on ep_insert()) and in the
+wakeup paths. The programs that tickle this behavior set up deeply
+linked networks of epoll file descriptors that cause the epoll
+algorithms to traverse them indefinitely. A couple of these sample
+programs have been previously posted in this thread:
+https://lkml.org/lkml/2011/2/25/297.
+
+To fix the loop detection path check algorithms, I simply keep track
+of the epoll nodes that have already been visited. Thus, the loop
+detection becomes proportional to the number of epoll file descriptors
+and links. This dramatically decreases the run-time of the loop check
+algorithm. In one diabolical case I tried, it reduced the run-time
+from 15 minutes (all in kernel time) to 0.3 seconds.
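+
+As a standalone sketch of the idea (a simplified model, not the
+kernel code; the real walk is ep_loop_check_proc() below, and the
+marks are cleared again in ep_loop_check()): each node is visited at
+most once, so the check is linear in the number of nodes and links
+rather than exponential in the number of distinct paths.
+
+	struct node {
+		int visited;
+		int nchildren;
+		struct node **children;
+	};
+
+	static int walk(struct node *n, int depth, int max_depth)
+	{
+		int i;
+
+		if (depth > max_depth)
+			return -1;	/* nested too deeply */
+		if (n->visited)
+			return 0;	/* already checked via another path */
+		n->visited = 1;
+		for (i = 0; i < n->nchildren; i++)
+			if (walk(n->children[i], depth + 1, max_depth))
+				return -1;
+		return 0;
+	}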
+
+Fixing the wakeup paths could be done at wakeup time in a similar
+manner, by keeping track of nodes that have already been visited, but
+it is harder there, since there can be multiple wakeups on different
+cpus. Thus, I've opted to limit the number of possible wakeup paths
+when the paths are created.
+
+This is accomplished by noting that the end file descriptors found
+during the loop detection pass (from the newly added link) are
+actually the sources for wakeup events. I keep a list of these file
+descriptors and limit the number and length of the paths that emanate
+from these 'source file descriptors'. In the current implementation I
+allow 1000 paths of length 1, 500 of length 2, 100 of length 3, 50 of
+length 4 and 10 of length 5. Note that it is sufficient to check the
+'source file descriptors' reachable from the newly added link, since
+no other 'source file descriptors' will have newly added links. This
+allows us to check only the wakeup paths that may have gotten too
+long, and not re-check all possible wakeup paths on the system.
+
+In terms of the path limit selection, I think it's first worth noting
+that the most common case for epoll is probably the model where you
+have 1 epoll file descriptor that is monitoring n 'source file
+descriptors'. In this case, each 'source file descriptor' has 1 path
+of length 1. Thus, I believe that the limits I'm proposing are quite
+reasonable and in fact may be too generous. Thus, I'm hoping that the
+proposed limits will not cause any workloads that currently work to
+fail.
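+
+For illustration, a hypothetical userspace sketch (not one of the test
+programs mentioned below; assumes the process fd limit has been raised
+well above the default 1024) of a setup the new limits are expected to
+reject: once more than 1000 epoll sets watch the same fd, the next
+EPOLL_CTL_ADD exceeds the length-1 path limit and should fail with
+-EINVAL.
+
+	#include <sys/epoll.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int pipefd[2], i;
+		struct epoll_event ev = { .events = EPOLLIN };
+
+		if (pipe(pipefd))
+			return 1;
+		for (i = 0; i < 1100; i++) {
+			int epfd = epoll_create1(0);
+
+			if (epfd < 0)
+				break;	/* fd limit reached before the path limit */
+			if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev)) {
+				perror("EPOLL_CTL_ADD");	/* expected: EINVAL past ~1000 */
+				break;
+			}
+		}
+		return 0;
+	}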
+
+In terms of locking, I have extended the use of the 'epmutex' to all
+epoll_ctl add and remove operations. Currently it's only used in a
+subset of the add paths. I need to hold the epmutex, so that we can
+correctly traverse a coherent graph, to check the number of paths. I
+believe that this additional locking is probably ok, since it's in the
+setup/teardown paths, and doesn't affect the running paths, but it
+certainly is going to add some extra overhead. Also worth noting is
+that the epmutex was recently added to the epoll_ctl add operations in
+the initial path loop detection code, using the argument that it was
+not on a critical path.
+
+Another thing to note here is the length of epoll chains that is
+allowed. Currently, eventpoll.c defines:
+
+/* Maximum number of nesting allowed inside epoll sets */
+#define EP_MAX_NESTS 4
+
+This basically means that I am limited to a graph depth of 5
+(EP_MAX_NESTS + 1). However, this limit is currently only enforced
+during the loop check detection code, and only when the epoll file
+descriptors are added in a certain order. Thus, this limit is
+currently easily bypassed. The newly added check for wakeup paths
+strictly limits the wakeup paths to a length of 5, regardless of the
+order in which ep's are linked together. Thus, a side-effect of the
+new code is a more consistent enforcement of the graph depth.
+
+Thus far, I've tested this using the sample programs previously
+mentioned, which now either return quickly or return -EINVAL. I've
+also tested using the piptest.c epoll tester, which showed no
+difference in performance. I've also created a number of different
+epoll networks and tested that they behave as expected.
+
+I believe this solves the original diabolical test cases, while still
+preserving the sane epoll nesting.
+
+Signed-off-by: Jason Baron <jbaron@redhat.com>
+Cc: Nelson Elhage <nelhage@ksplice.com>
+Cc: Davide Libenzi <davidel@xmailserver.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/eventpoll.c | 234 +++++++++++++++++++++++++++++++++++++++++-----
+ include/linux/eventpoll.h | 1
+ include/linux/fs.h | 1
+ 3 files changed, 211 insertions(+), 25 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -197,6 +197,12 @@ struct eventpoll {
+
+ /* The user that created the eventpoll descriptor */
+ struct user_struct *user;
++
++ struct file *file;
++
++ /* used to optimize loop detection check */
++ int visited;
++ struct list_head visited_list_link;
+ };
+
+ /* Wait structure used by the poll hooks */
+@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __re
+ /* Slab cache used to allocate "struct eppoll_entry" */
+ static struct kmem_cache *pwq_cache __read_mostly;
+
++/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
++static LIST_HEAD(visited_list);
++
++/*
++ * List of files with newly added links, where we may need to limit the number
++ * of emanating paths. Protected by the epmutex.
++ */
++static LIST_HEAD(tfile_check_list);
++
+ #ifdef CONFIG_SYSCTL
+
+ #include <linux/sysctl.h>
+@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
+ };
+ #endif /* CONFIG_SYSCTL */
+
++static const struct file_operations eventpoll_fops;
++
++static inline int is_file_epoll(struct file *f)
++{
++ return f->f_op == &eventpoll_fops;
++}
+
+ /* Setup the structure that is used as key for the RB tree */
+ static inline void ep_set_ffd(struct epoll_filefd *ffd,
+@@ -728,12 +749,6 @@ static const struct file_operations even
+ .llseek = noop_llseek,
+ };
+
+-/* Fast test to see if the file is an evenpoll file */
+-static inline int is_file_epoll(struct file *f)
+-{
+- return f->f_op == &eventpoll_fops;
+-}
+-
+ /*
+ * This is called from eventpoll_release() to unlink files from the eventpoll
+ * interface. We need to have this facility to cleanup correctly files that are
+@@ -954,6 +969,99 @@ static void ep_rbtree_insert(struct even
+ rb_insert_color(&epi->rbn, &ep->rbr);
+ }
+
++
++
++#define PATH_ARR_SIZE 5
++/*
++ * These are the number paths of length 1 to 5, that we are allowing to emanate
++ * from a single file of interest. For example, we allow 1000 paths of length
++ * 1, to emanate from each file of interest. This essentially represents the
++ * potential wakeup paths, which need to be limited in order to avoid massive
++ * uncontrolled wakeup storms. The common use case should be a single ep which
++ * is connected to n file sources. In this case each file source has 1 path
++ * of length 1. Thus, the numbers below should be more than sufficient. These
++ * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
++ * and delete can't add additional paths. Protected by the epmutex.
++ */
++static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
++static int path_count[PATH_ARR_SIZE];
++
++static int path_count_inc(int nests)
++{
++ if (++path_count[nests] > path_limits[nests])
++ return -1;
++ return 0;
++}
++
++static void path_count_init(void)
++{
++ int i;
++
++ for (i = 0; i < PATH_ARR_SIZE; i++)
++ path_count[i] = 0;
++}
++
++static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
++{
++ int error = 0;
++ struct file *file = priv;
++ struct file *child_file;
++ struct epitem *epi;
++
++ list_for_each_entry(epi, &file->f_ep_links, fllink) {
++ child_file = epi->ep->file;
++ if (is_file_epoll(child_file)) {
++ if (list_empty(&child_file->f_ep_links)) {
++ if (path_count_inc(call_nests)) {
++ error = -1;
++ break;
++ }
++ } else {
++ error = ep_call_nested(&poll_loop_ncalls,
++ EP_MAX_NESTS,
++ reverse_path_check_proc,
++ child_file, child_file,
++ current);
++ }
++ if (error != 0)
++ break;
++ } else {
++ printk(KERN_ERR "reverse_path_check_proc: "
++ "file is not an ep!\n");
++ }
++ }
++ return error;
++}
++
++/**
++ * reverse_path_check - The tfile_check_list is list of file *, which have
++ * links that are proposed to be newly added. We need to
++ * make sure that those added links don't add too many
++ * paths such that we will spend all our time waking up
++ * eventpoll objects.
++ *
++ * Returns: Returns zero if the proposed links don't create too many paths,
++ * -1 otherwise.
++ */
++static int reverse_path_check(void)
++{
++ int length = 0;
++ int error = 0;
++ struct file *current_file;
++
++ /* let's call this for all tfiles */
++ list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
++ length++;
++ path_count_init();
++ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
++ reverse_path_check_proc, current_file,
++ current_file, current);
++ if (error)
++ break;
++ }
++ return error;
++}
++
+ /*
+ * Must be called with "mtx" held.
+ */
+@@ -1015,6 +1123,11 @@ static int ep_insert(struct eventpoll *e
+ */
+ ep_rbtree_insert(ep, epi);
+
++ /* now check if we've created too many backpaths */
++ error = -EINVAL;
++ if (reverse_path_check())
++ goto error_remove_epi;
++
+ /* We have to drop the new item inside our item list to keep track of it */
+ spin_lock_irqsave(&ep->lock, flags);
+
+@@ -1039,6 +1152,14 @@ static int ep_insert(struct eventpoll *e
+
+ return 0;
+
++error_remove_epi:
++ spin_lock(&tfile->f_lock);
++ if (ep_is_linked(&epi->fllink))
++ list_del_init(&epi->fllink);
++ spin_unlock(&tfile->f_lock);
++
++ rb_erase(&epi->rbn, &ep->rbr);
++
+ error_unregister:
+ ep_unregister_pollwait(ep, epi);
+
+@@ -1303,18 +1424,36 @@ static int ep_loop_check_proc(void *priv
+ int error = 0;
+ struct file *file = priv;
+ struct eventpoll *ep = file->private_data;
++ struct eventpoll *ep_tovisit;
+ struct rb_node *rbp;
+ struct epitem *epi;
+
+ mutex_lock_nested(&ep->mtx, call_nests + 1);
++ ep->visited = 1;
++ list_add(&ep->visited_list_link, &visited_list);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ epi = rb_entry(rbp, struct epitem, rbn);
+ if (unlikely(is_file_epoll(epi->ffd.file))) {
++ ep_tovisit = epi->ffd.file->private_data;
++ if (ep_tovisit->visited)
++ continue;
+ error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+- ep_loop_check_proc, epi->ffd.file,
+- epi->ffd.file->private_data, current);
++ ep_loop_check_proc, epi->ffd.file,
++ ep_tovisit, current);
+ if (error != 0)
+ break;
++ } else {
++ /*
++ * If we've reached a file that is not associated with
++ * an ep, then we need to check if the newly added
++ * links are going to add too many wakeup paths. We do
++ * this by adding it to the tfile_check_list, if it's
++ * not already there, and calling reverse_path_check()
++ * during ep_insert().
++ */
++ if (list_empty(&epi->ffd.file->f_tfile_llink))
++ list_add(&epi->ffd.file->f_tfile_llink,
++ &tfile_check_list);
+ }
+ }
+ mutex_unlock(&ep->mtx);
+@@ -1335,8 +1474,31 @@ static int ep_loop_check_proc(void *priv
+ */
+ static int ep_loop_check(struct eventpoll *ep, struct file *file)
+ {
+- return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
++ int ret;
++ struct eventpoll *ep_cur, *ep_next;
++
++ ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ep_loop_check_proc, file, ep, current);
++ /* clear visited list */
++ list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
++ visited_list_link) {
++ ep_cur->visited = 0;
++ list_del(&ep_cur->visited_list_link);
++ }
++ return ret;
++}
++
++static void clear_tfile_check_list(void)
++{
++ struct file *file;
++
++ /* first clear the tfile_check_list */
++ while (!list_empty(&tfile_check_list)) {
++ file = list_first_entry(&tfile_check_list, struct file,
++ f_tfile_llink);
++ list_del_init(&file->f_tfile_llink);
++ }
++ INIT_LIST_HEAD(&tfile_check_list);
+ }
+
+ /*
+@@ -1344,8 +1506,9 @@ static int ep_loop_check(struct eventpol
+ */
+ SYSCALL_DEFINE1(epoll_create1, int, flags)
+ {
+- int error;
++ int error, fd;
+ struct eventpoll *ep = NULL;
++ struct file *file;
+
+ /* Check the EPOLL_* constant for consistency. */
+ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
+@@ -1362,11 +1525,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flag
+ * Creates all the items needed to setup an eventpoll file. That is,
+ * a file structure and a free file descriptor.
+ */
+- error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
++ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
++ if (fd < 0) {
++ error = fd;
++ goto out_free_ep;
++ }
++ file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
+ O_RDWR | (flags & O_CLOEXEC));
+- if (error < 0)
+- ep_free(ep);
+-
++ if (IS_ERR(file)) {
++ error = PTR_ERR(file);
++ goto out_free_fd;
++ }
++ fd_install(fd, file);
++ ep->file = file;
++ return fd;
++
++out_free_fd:
++ put_unused_fd(fd);
++out_free_ep:
++ ep_free(ep);
+ return error;
+ }
+
+@@ -1432,21 +1609,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, in
+ /*
+ * When we insert an epoll file descriptor, inside another epoll file
+ * descriptor, there is the change of creating closed loops, which are
+- * better be handled here, than in more critical paths.
++ * better be handled here, than in more critical paths. While we are
++ * checking for loops we also determine the list of files reachable
++ * and hang them on the tfile_check_list, so we can check that we
++ * haven't created too many possible wakeup paths.
+ *
+- * We hold epmutex across the loop check and the insert in this case, in
+- * order to prevent two separate inserts from racing and each doing the
+- * insert "at the same time" such that ep_loop_check passes on both
+- * before either one does the insert, thereby creating a cycle.
++ * We need to hold the epmutex across both ep_insert and ep_remove
++ * b/c we want to make sure we are looking at a coherent view of
++ * epoll network.
+ */
+- if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
++ if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
+ mutex_lock(&epmutex);
+ did_lock_epmutex = 1;
+- error = -ELOOP;
+- if (ep_loop_check(ep, tfile) != 0)
+- goto error_tgt_fput;
+ }
+-
++ if (op == EPOLL_CTL_ADD) {
++ if (is_file_epoll(tfile)) {
++ error = -ELOOP;
++ if (ep_loop_check(ep, tfile) != 0)
++ goto error_tgt_fput;
++ } else
++ list_add(&tfile->f_tfile_llink, &tfile_check_list);
++ }
+
+ mutex_lock_nested(&ep->mtx, 0);
+
+@@ -1465,6 +1648,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, in
+ error = ep_insert(ep, &epds, tfile, fd);
+ } else
+ error = -EEXIST;
++ clear_tfile_check_list();
+ break;
+ case EPOLL_CTL_DEL:
+ if (epi)
+@@ -1483,7 +1667,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, in
+ mutex_unlock(&ep->mtx);
+
+ error_tgt_fput:
+- if (unlikely(did_lock_epmutex))
++ if (did_lock_epmutex)
+ mutex_unlock(&epmutex);
+
+ fput(tfile);
+--- a/include/linux/eventpoll.h
++++ b/include/linux/eventpoll.h
+@@ -61,6 +61,7 @@ struct file;
+ static inline void eventpoll_init_file(struct file *file)
+ {
+ INIT_LIST_HEAD(&file->f_ep_links);
++ INIT_LIST_HEAD(&file->f_tfile_llink);
+ }
+
+
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -969,6 +969,7 @@ struct file {
+ #ifdef CONFIG_EPOLL
+ /* Used by fs/eventpoll.c to link all the hooks to this file */
+ struct list_head f_ep_links;
++ struct list_head f_tfile_llink;
+ #endif /* #ifdef CONFIG_EPOLL */
+ struct address_space *f_mapping;
+ #ifdef CONFIG_DEBUG_WRITECOUNT