--- /dev/null
+Subject: exit/exec: Separate mm_release()
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:38 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 4610ba7ad877fafc0a25a30c6c82015304120426 upstream
+
+mm_release() contains the futex exit handling. mm_release() is called from
+do_exit()->exit_mm() and from exec()->exec_mm().
+
+In the exit_mm() case PF_EXITING is set and the futex state is updated. In
+the exec_mm() case these states are not touched.
+
+As the futex exit code needs further protections against exit races, this
+needs to be split into two functions.
+
+Preparatory only, no functional change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.240518241@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c | 2 +-
+ include/linux/sched/mm.h | 6 ++++--
+ kernel/exit.c | 2 +-
+ kernel/fork.c | 12 +++++++++++-
+ 4 files changed, 17 insertions(+), 5 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1011,7 +1011,7 @@ static int exec_mmap(struct mm_struct *m
+ /* Notify parent that we're no longer interested in the old VM */
+ tsk = current;
+ old_mm = current->mm;
+- mm_release(tsk, old_mm);
++ exec_mm_release(tsk, old_mm);
+
+ if (old_mm) {
+ sync_mm_rss(old_mm);
+--- a/include/linux/sched/mm.h
++++ b/include/linux/sched/mm.h
+@@ -119,8 +119,10 @@ extern struct mm_struct *get_task_mm(str
+ * succeeds.
+ */
+ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
+-/* Remove the current tasks stale references to the old mm_struct */
+-extern void mm_release(struct task_struct *, struct mm_struct *);
++/* Remove the current task's stale references to the old mm_struct on exit() */
++extern void exit_mm_release(struct task_struct *, struct mm_struct *);
++/* Remove the current task's stale references to the old mm_struct on exec() */
++extern void exec_mm_release(struct task_struct *, struct mm_struct *);
+
+ #ifdef CONFIG_MEMCG
+ extern void mm_update_next_owner(struct mm_struct *mm);
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -498,7 +498,7 @@ static void exit_mm(void)
+ struct mm_struct *mm = current->mm;
+ struct core_state *core_state;
+
+- mm_release(current, mm);
++ exit_mm_release(current, mm);
+ if (!mm)
+ return;
+ sync_mm_rss(mm);
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1217,7 +1217,7 @@ static int wait_for_vfork_done(struct ta
+ * restoring the old one. . .
+ * Eric Biederman 10 January 1998
+ */
+-void mm_release(struct task_struct *tsk, struct mm_struct *mm)
++static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
+ /* Get rid of any futexes when releasing the mm */
+ futex_mm_release(tsk);
+@@ -1254,6 +1254,16 @@ void mm_release(struct task_struct *tsk,
+ complete_vfork_done(tsk);
+ }
+
++void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
++{
++ mm_release(tsk, mm);
++}
++
++void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
++{
++ mm_release(tsk, mm);
++}
++
+ /*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
--- /dev/null
+Subject: futex: Add mutex around futex exit
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:44 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 3f186d974826847a07bc7964d79ec4eded475ad9 upstream
+
+The mutex will be used in subsequent changes to replace the busy looping of
+a waiter when the futex owner is currently executing the exit cleanup,
+preventing a potential live lock.
+
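+A hedged userspace model of that pattern (pthreads instead of the kernel
+primitives; the names futex_state and futex_exit_mutex mirror the diff
+below, everything else is fabricated for the demo): the waiter blocks on
+the exiting owner's mutex instead of burning CPU in a retry loop.
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <unistd.h>
+
+enum { STATE_OK, STATE_EXITING, STATE_DEAD };
+
+static pthread_mutex_t futex_exit_mutex = PTHREAD_MUTEX_INITIALIZER;
+static atomic_int futex_state = STATE_OK;
+
+static void *exiting_owner(void *arg)
+{
+	(void)arg;
+	pthread_mutex_lock(&futex_exit_mutex);	/* cleanup begins */
+	atomic_store(&futex_state, STATE_EXITING);
+	usleep(100 * 1000);	/* stands in for the futex exit cleanup */
+	atomic_store(&futex_state, STATE_DEAD);
+	pthread_mutex_unlock(&futex_exit_mutex);	/* cleanup done */
+	return NULL;
+}
+
+static void *waiter(void *arg)
+{
+	(void)arg;
+	while (atomic_load(&futex_state) != STATE_DEAD) {
+		if (atomic_load(&futex_state) == STATE_EXITING) {
+			/* Sleep until the cleanup drops the mutex,
+			   instead of spinning on the state. */
+			pthread_mutex_lock(&futex_exit_mutex);
+			pthread_mutex_unlock(&futex_exit_mutex);
+		}
+	}
+	puts("owner reached DEAD; waiter reevaluates");
+	return NULL;
+}
+
+int main(void)
+{
+	pthread_t o, w;
+
+	pthread_create(&o, NULL, exiting_owner, NULL);
+	pthread_create(&w, NULL, waiter, NULL);
+	pthread_join(o, NULL);
+	pthread_join(w, NULL);
+	return 0;
+}
+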
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.845798895@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h | 1 +
+ include/linux/sched.h | 1 +
+ kernel/futex.c | 16 ++++++++++++++++
+ 3 files changed, 18 insertions(+)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -68,6 +68,7 @@ static inline void futex_init_task(struc
+ INIT_LIST_HEAD(&tsk->pi_state_list);
+ tsk->pi_state_cache = NULL;
+ tsk->futex_state = FUTEX_STATE_OK;
++ mutex_init(&tsk->futex_exit_mutex);
+ }
+
+ void futex_exit_recursive(struct task_struct *tsk);
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -996,6 +996,7 @@ struct task_struct {
+ #endif
+ struct list_head pi_state_list;
+ struct futex_pi_state *pi_state_cache;
++ struct mutex futex_exit_mutex;
+ unsigned int futex_state;
+ #endif
+ #ifdef CONFIG_PERF_EVENTS
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3735,12 +3735,23 @@ static void futex_cleanup(struct task_st
+ */
+ void futex_exit_recursive(struct task_struct *tsk)
+ {
++ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
++ if (tsk->futex_state == FUTEX_STATE_EXITING)
++ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+
+ static void futex_cleanup_begin(struct task_struct *tsk)
+ {
+ /*
++ * Prevent various race issues against a concurrent incoming waiter
++ * including live locks by forcing the waiter to block on
++ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
++ * attach_to_pi_owner().
++ */
++ mutex_lock(&tsk->futex_exit_mutex);
++
++ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+@@ -3763,6 +3774,11 @@ static void futex_cleanup_end(struct tas
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
++ /*
++ * Drop the exit protection. This unblocks waiters which observed
++ * FUTEX_STATE_EXITING to reevaluate the state.
++ */
++ mutex_unlock(&tsk->futex_exit_mutex);
+ }
+
+ void futex_exec_release(struct task_struct *tsk)
--- /dev/null
+Subject: futex: Mark the begin of futex exit explicitly
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:41 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 18f694385c4fd77a09851fd301236746ca83f3cb upstream
+
+Instead of relying on PF_EXITING use an explicit state for the futex exit
+and set it in the futex exit function. This moves the smp barrier and the
+lock/unlock serialization into the futex code.
+
+As with the DEAD state this is restricted to the exit path as exec
+continues to use the same task struct.
+
+This allows that logic to be simplified in a subsequent step.
+
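+For illustration, a hedged sketch of the waiter side decision with the new
+tri-state (simplified; in the real code the FUTEX_STATE_DEAD case goes
+through handle_exit_race(), which rereads the futex value and may still
+return -EAGAIN):
+
+#include <errno.h>
+#include <stdio.h>
+
+enum {
+	FUTEX_STATE_OK,
+	FUTEX_STATE_EXITING,
+	FUTEX_STATE_DEAD,
+};
+
+static int waiter_sees(int futex_state)
+{
+	switch (futex_state) {
+	case FUTEX_STATE_OK:
+		return 0;	/* owner alive: attach the pi_state */
+	case FUTEX_STATE_EXITING:
+		return -EAGAIN;	/* exit cleanup in flight: retry */
+	default:
+		return -ESRCH;	/* cleanup done: OWNER_DIED handling */
+	}
+}
+
+int main(void)
+{
+	printf("OK -> %d, EXITING -> %d, DEAD -> %d\n",
+	       waiter_sees(FUTEX_STATE_OK),
+	       waiter_sees(FUTEX_STATE_EXITING),
+	       waiter_sees(FUTEX_STATE_DEAD));
+	return 0;
+}
+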
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.539409004@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h | 31 +++----------------------------
+ kernel/exit.c | 13 +------------
+ kernel/futex.c | 37 ++++++++++++++++++++++++++++++++++++-
+ 3 files changed, 40 insertions(+), 41 deletions(-)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -55,6 +55,7 @@ union futex_key {
+ #ifdef CONFIG_FUTEX
+ enum {
+ FUTEX_STATE_OK,
++ FUTEX_STATE_EXITING,
+ FUTEX_STATE_DEAD,
+ };
+
+@@ -69,33 +70,7 @@ static inline void futex_init_task(struc
+ tsk->futex_state = FUTEX_STATE_OK;
+ }
+
+-/**
+- * futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD
+- * @tsk: task to set the state on
+- *
+- * Set the futex exit state of the task lockless. The futex waiter code
+- * observes that state when a task is exiting and loops until the task has
+- * actually finished the futex cleanup. The worst case for this is that the
+- * waiter runs through the wait loop until the state becomes visible.
+- *
+- * This has two callers:
+- *
+- * - futex_mm_release() after the futex exit cleanup has been done
+- *
+- * - do_exit() from the recursive fault handling path.
+- *
+- * In case of a recursive fault this is best effort. Either the futex exit
+- * code has run already or not. If the OWNER_DIED bit has been set on the
+- * futex then the waiter can take it over. If not, the problem is pushed
+- * back to user space. If the futex exit code did not run yet, then an
+- * already queued waiter might block forever, but there is nothing which
+- * can be done about that.
+- */
+-static inline void futex_exit_done(struct task_struct *tsk)
+-{
+- tsk->futex_state = FUTEX_STATE_DEAD;
+-}
+-
++void futex_exit_recursive(struct task_struct *tsk);
+ void futex_exit_release(struct task_struct *tsk);
+ void futex_exec_release(struct task_struct *tsk);
+
+@@ -103,7 +78,7 @@ long do_futex(u32 __user *uaddr, int op,
+ u32 __user *uaddr2, u32 val2, u32 val3);
+ #else
+ static inline void futex_init_task(struct task_struct *tsk) { }
+-static inline void futex_exit_done(struct task_struct *tsk) { }
++static inline void futex_exit_recursive(struct task_struct *tsk) { }
+ static inline void futex_exit_release(struct task_struct *tsk) { }
+ static inline void futex_exec_release(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -818,23 +818,12 @@ void __noreturn do_exit(long code)
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+- futex_exit_done(tsk);
++ futex_exit_recursive(tsk);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ }
+
+ exit_signals(tsk); /* sets PF_EXITING */
+- /*
+- * Ensure that all new tsk->pi_lock acquisitions must observe
+- * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
+- */
+- smp_mb();
+- /*
+- * Ensure that we must observe the pi_state in exit_mm() ->
+- * mm_release() -> exit_pi_state_list().
+- */
+- raw_spin_lock_irq(&tsk->pi_lock);
+- raw_spin_unlock_irq(&tsk->pi_lock);
+
+ /* sync mm's RSS info before statistics gathering */
+ if (tsk->mm)
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3716,10 +3716,45 @@ void futex_exec_release(struct task_stru
+ exit_pi_state_list(tsk);
+ }
+
++/**
++ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
++ * @tsk: task to set the state on
++ *
++ * Set the futex exit state of the task lockless. The futex waiter code
++ * observes that state when a task is exiting and loops until the task has
++ * actually finished the futex cleanup. The worst case for this is that the
++ * waiter runs through the wait loop until the state becomes visible.
++ *
++ * This is called from the recursive fault handling path in do_exit().
++ *
++ * This is best effort. Either the futex exit code has run already or
++ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
++ * take it over. If not, the problem is pushed back to user space. If the
++ * futex exit code did not run yet, then an already queued waiter might
++ * block forever, but there is nothing which can be done about that.
++ */
++void futex_exit_recursive(struct task_struct *tsk)
++{
++ tsk->futex_state = FUTEX_STATE_DEAD;
++}
++
+ void futex_exit_release(struct task_struct *tsk)
+ {
++ tsk->futex_state = FUTEX_STATE_EXITING;
++ /*
++ * Ensure that all new tsk->pi_lock acquisitions must observe
++ * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner().
++ */
++ smp_mb();
++ /*
++ * Ensure that we must observe the pi_state in exit_pi_state_list().
++ */
++ raw_spin_lock_irq(&tsk->pi_lock);
++ raw_spin_unlock_irq(&tsk->pi_lock);
++
+ futex_exec_release(tsk);
+- futex_exit_done(tsk);
++
++ tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
--- /dev/null
+Subject: futex: Move futex exit handling into futex code
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:36 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit ba31c1a48538992316cc71ce94fa9cd3e7b427c0 upstream
+
+The futex exit handling is #ifdeffed into mm_release() which is not pretty
+to begin with. But upcoming changes to address futex exit races need to add
+more functionality to this exit code.
+
+Split it out into a function, move it into futex code and make the various
+futex exit functions static.
+
+Preparatory only and no functional change.
+
+Folded build fix from Borislav.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.049705556@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/compat.h | 2 --
+ include/linux/futex.h | 29 ++++++++++++++++-------------
+ kernel/fork.c | 25 +++----------------------
+ kernel/futex.c | 33 +++++++++++++++++++++++++++++----
+ 4 files changed, 48 insertions(+), 41 deletions(-)
+
+--- a/include/linux/compat.h
++++ b/include/linux/compat.h
+@@ -445,8 +445,6 @@ struct compat_kexec_segment;
+ struct compat_mq_attr;
+ struct compat_msgbuf;
+
+-extern void compat_exit_robust_list(struct task_struct *curr);
+-
+ #define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t))
+
+ #define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG)
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -2,7 +2,9 @@
+ #ifndef _LINUX_FUTEX_H
+ #define _LINUX_FUTEX_H
+
++#include <linux/sched.h>
+ #include <linux/ktime.h>
++
+ #include <uapi/linux/futex.h>
+
+ struct inode;
+@@ -51,15 +53,24 @@ union futex_key {
+ #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
+
+ #ifdef CONFIG_FUTEX
+-extern void exit_robust_list(struct task_struct *curr);
+
+-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+- u32 __user *uaddr2, u32 val2, u32 val3);
+-#else
+-static inline void exit_robust_list(struct task_struct *curr)
++static inline void futex_init_task(struct task_struct *tsk)
+ {
++ tsk->robust_list = NULL;
++#ifdef CONFIG_COMPAT
++ tsk->compat_robust_list = NULL;
++#endif
++ INIT_LIST_HEAD(&tsk->pi_state_list);
++ tsk->pi_state_cache = NULL;
+ }
+
++void futex_mm_release(struct task_struct *tsk);
++
++long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
++ u32 __user *uaddr2, u32 val2, u32 val3);
++#else
++static inline void futex_init_task(struct task_struct *tsk) { }
++static inline void futex_mm_release(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+ ktime_t *timeout, u32 __user *uaddr2,
+ u32 val2, u32 val3)
+@@ -68,12 +79,4 @@ static inline long do_futex(u32 __user *
+ }
+ #endif
+
+-#ifdef CONFIG_FUTEX_PI
+-extern void exit_pi_state_list(struct task_struct *curr);
+-#else
+-static inline void exit_pi_state_list(struct task_struct *curr)
+-{
+-}
+-#endif
+-
+ #endif
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1220,20 +1220,7 @@ static int wait_for_vfork_done(struct ta
+ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
+ /* Get rid of any futexes when releasing the mm */
+-#ifdef CONFIG_FUTEX
+- if (unlikely(tsk->robust_list)) {
+- exit_robust_list(tsk);
+- tsk->robust_list = NULL;
+- }
+-#ifdef CONFIG_COMPAT
+- if (unlikely(tsk->compat_robust_list)) {
+- compat_exit_robust_list(tsk);
+- tsk->compat_robust_list = NULL;
+- }
+-#endif
+- if (unlikely(!list_empty(&tsk->pi_state_list)))
+- exit_pi_state_list(tsk);
+-#endif
++ futex_mm_release(tsk);
+
+ uprobe_free_utask(tsk);
+
+@@ -1937,14 +1924,8 @@ static __latent_entropy struct task_stru
+ #ifdef CONFIG_BLOCK
+ p->plug = NULL;
+ #endif
+-#ifdef CONFIG_FUTEX
+- p->robust_list = NULL;
+-#ifdef CONFIG_COMPAT
+- p->compat_robust_list = NULL;
+-#endif
+- INIT_LIST_HEAD(&p->pi_state_list);
+- p->pi_state_cache = NULL;
+-#endif
++ futex_init_task(p);
++
+ /*
+ * sigaltstack should be cleared when sharing the same VM
+ */
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -341,6 +341,12 @@ static inline bool should_fail_futex(boo
+ }
+ #endif /* CONFIG_FAIL_FUTEX */
+
++#ifdef CONFIG_COMPAT
++static void compat_exit_robust_list(struct task_struct *curr);
++#else
++static inline void compat_exit_robust_list(struct task_struct *curr) { }
++#endif
++
+ static inline void futex_get_mm(union futex_key *key)
+ {
+ mmgrab(key->private.mm);
+@@ -890,7 +896,7 @@ static void put_pi_state(struct futex_pi
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+-void exit_pi_state_list(struct task_struct *curr)
++static void exit_pi_state_list(struct task_struct *curr)
+ {
+ struct list_head *next, *head = &curr->pi_state_list;
+ struct futex_pi_state *pi_state;
+@@ -960,7 +966,8 @@ void exit_pi_state_list(struct task_stru
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+ }
+-
++#else
++static inline void exit_pi_state_list(struct task_struct *curr) { }
+ #endif
+
+ /*
+@@ -3625,7 +3632,7 @@ static inline int fetch_robust_entry(str
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+-void exit_robust_list(struct task_struct *curr)
++static void exit_robust_list(struct task_struct *curr)
+ {
+ struct robust_list_head __user *head = curr->robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+@@ -3690,6 +3697,24 @@ void exit_robust_list(struct task_struct
+ }
+ }
+
++void futex_mm_release(struct task_struct *tsk)
++{
++ if (unlikely(tsk->robust_list)) {
++ exit_robust_list(tsk);
++ tsk->robust_list = NULL;
++ }
++
++#ifdef CONFIG_COMPAT
++ if (unlikely(tsk->compat_robust_list)) {
++ compat_exit_robust_list(tsk);
++ tsk->compat_robust_list = NULL;
++ }
++#endif
++
++ if (unlikely(!list_empty(&tsk->pi_state_list)))
++ exit_pi_state_list(tsk);
++}
++
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+ {
+@@ -3817,7 +3842,7 @@ static void __user *futex_uaddr(struct r
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+-void compat_exit_robust_list(struct task_struct *curr)
++static void compat_exit_robust_list(struct task_struct *curr)
+ {
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
--- /dev/null
+Subject: futex: Prevent exit livelock
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:46 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 3ef240eaff36b8119ac9e2ea17cbf41179c930ba upstream
+
+Oleg provided the following test case:
+
+#include <assert.h>
+#include <linux/futex.h>
+#include <sched.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+int main(void)
+{
+ struct sched_param sp = {};
+
+ sp.sched_priority = 2;
+ assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
+
+ int lock = vfork();
+ if (!lock) {
+ sp.sched_priority = 1;
+ assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
+ _exit(0);
+ }
+
+ syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
+ return 0;
+}
+
+This creates an unkillable RT process spinning in futex_lock_pi() on a UP
+machine or if the process is affine to a single CPU. The reason is:
+
+ parent child
+
+ set FIFO prio 2
+
+ vfork() -> set FIFO prio 1
+ implies wait_for_child() sched_setscheduler(...)
+ exit()
+ do_exit()
+ ....
+ mm_release()
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ exit_futex(); (NOOP in this case)
+ complete() --> wakes parent
+ sys_futex()
+ loop infinite because
+ tsk->futex_state == FUTEX_STATE_EXITING
+
+The same problem can happen just by regular preemption as well:
+
+ task holds futex
+ ...
+ do_exit()
+ tsk->futex_state = FUTEX_STATE_EXITING;
+
+ --> preemption (unrelated wakeup of some other higher prio task, e.g. timer)
+
+ switch_to(other_task)
+
+ return to user
+ sys_futex()
+ loop infinite as above
+
+Just for the fun of it the futex exit cleanup could trigger the wakeup
+itself before the task sets its futex state to DEAD.
+
+To cure this, the handling of the exiting owner is changed so that:
+
+ - A refcount is held on the task
+
+ - The task pointer is stored in a caller visible location
+
+ - The caller drops all locks (hash bucket, mmap_sem) and blocks
+ on task::futex_exit_mutex. When the mutex is acquired then
+ the exiting task has completed the cleanup and the state
+ is consistent and can be reevaluated.
+
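+A hedged userspace model of these three steps (pthreads and a heap
+allocated struct task stand in for the kernel primitives; names and
+timing are assumptions for the demo):
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+struct task {
+	atomic_int refcount;
+	atomic_int futex_state;		/* 0 OK, 1 EXITING, 2 DEAD */
+	pthread_mutex_t futex_exit_mutex;
+};
+
+static struct task *get_task(struct task *t)
+{
+	atomic_fetch_add(&t->refcount, 1);
+	return t;
+}
+
+static void put_task(struct task *t)
+{
+	if (atomic_fetch_sub(&t->refcount, 1) == 1) {
+		pthread_mutex_destroy(&t->futex_exit_mutex);
+		free(t);
+	}
+}
+
+static void *owner_exit(void *arg)
+{
+	struct task *t = arg;
+
+	pthread_mutex_lock(&t->futex_exit_mutex);	/* cleanup begins */
+	atomic_store(&t->futex_state, 1);
+	usleep(100 * 1000);				/* futex exit cleanup */
+	atomic_store(&t->futex_state, 2);
+	pthread_mutex_unlock(&t->futex_exit_mutex);	/* cleanup done */
+	put_task(t);
+	return NULL;
+}
+
+static void *waiter(void *arg)
+{
+	/* the attach attempt saw the owner exiting: keep a reference and
+	   remember the task in a caller visible location */
+	struct task *exiting = get_task(arg);
+
+	while (atomic_load(&exiting->futex_state) == 0)
+		;	/* owner not exiting yet; the real code retries */
+
+	/* all locks (hash bucket, mmap_sem) are dropped here, then: */
+	pthread_mutex_lock(&exiting->futex_exit_mutex);
+	pthread_mutex_unlock(&exiting->futex_exit_mutex);
+
+	printf("owner state now %d; state is consistent, reevaluate\n",
+	       atomic_load(&exiting->futex_state));
+	put_task(exiting);
+	return NULL;
+}
+
+int main(void)
+{
+	struct task *t = calloc(1, sizeof(*t));
+	pthread_t o, w;
+
+	atomic_store(&t->refcount, 1);
+	pthread_mutex_init(&t->futex_exit_mutex, NULL);
+	pthread_create(&o, NULL, owner_exit, get_task(t));
+	pthread_create(&w, NULL, waiter, t);
+	pthread_join(o, NULL);
+	pthread_join(w, NULL);
+	put_task(t);
+	return 0;
+}
+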
+This is not a pretty solution, but the only alternative would be returning
+an error code to user space, which would break the state consistency
+guarantee and open another can of problems including regressions.
+
+For stable backports the preparatory commits ac31c7ff8624 .. ba31c1a48538
+are required as well, but for anything older than 5.3.y the backports are
+going to be provided when this hits mainline as the other dependencies for
+those kernels are definitely not stable material.
+
+Fixes: 778e9a9c3e71 ("pi-futex: fix exit races and locking problems")
+Reported-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Stable Team <stable@vger.kernel.org>
+Link: https://lkml.kernel.org/r/20191106224557.041676471@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 92 insertions(+), 15 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1176,6 +1176,37 @@ out_error:
+ return ret;
+ }
+
++/**
++ * wait_for_owner_exiting - Block until the owner has exited
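++ * @ret: owner's current futex lock status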
++ * @exiting: Pointer to the exiting task
++ *
++ * Caller must hold a refcount on @exiting.
++ */
++static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
++{
++ if (ret != -EBUSY) {
++ WARN_ON_ONCE(exiting);
++ return;
++ }
++
++ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
++ return;
++
++ mutex_lock(&exiting->futex_exit_mutex);
++ /*
++ * No point in doing state checking here. If the waiter got here
++ * while the task was in exec()->exec_futex_release() then it can
++ * have any FUTEX_STATE_* value when the waiter has acquired the
++ * mutex. OK, if running, EXITING or DEAD if it reached exit()
++ * already. Highly unlikely and not a problem. Just one more round
++ * through the futex maze.
++ */
++ mutex_unlock(&exiting->futex_exit_mutex);
++
++ put_task_struct(exiting);
++}
++
+ static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+ {
+@@ -1237,7 +1267,8 @@ static int handle_exit_race(u32 __user *
+ * it after doing proper sanity checks.
+ */
+ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+- struct futex_pi_state **ps)
++ struct futex_pi_state **ps,
++ struct task_struct **exiting)
+ {
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct futex_pi_state *pi_state;
+@@ -1276,7 +1307,19 @@ static int attach_to_pi_owner(u32 __user
+ int ret = handle_exit_race(uaddr, uval, p);
+
+ raw_spin_unlock_irq(&p->pi_lock);
+- put_task_struct(p);
++ /*
++ * If the owner task is between FUTEX_STATE_EXITING and
++ * FUTEX_STATE_DEAD then store the task pointer and keep
++ * the reference on the task struct. The calling code will
++ * drop all locks, wait for the task to reach
++ * FUTEX_STATE_DEAD and then drop the refcount. This is
++ * required to prevent a live lock when the current task
++ * preempted the exiting task between the two states.
++ */
++ if (ret == -EBUSY)
++ *exiting = p;
++ else
++ put_task_struct(p);
+ return ret;
+ }
+
+@@ -1315,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user
+
+ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
+- union futex_key *key, struct futex_pi_state **ps)
++ union futex_key *key, struct futex_pi_state **ps,
++ struct task_struct **exiting)
+ {
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
+
+@@ -1330,7 +1374,7 @@ static int lookup_pi_state(u32 __user *u
+ * We are the first waiter - try to look up the owner based on
+ * @uval and attach to it.
+ */
+- return attach_to_pi_owner(uaddr, uval, key, ps);
++ return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
+ }
+
+ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+@@ -1358,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __u
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
++ * @exiting: Pointer to store the task pointer of the owner task
++ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Return:
+@@ -1366,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __u
+ * - <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
++ *
++ * @exiting is only set when the return value is -EBUSY. If so, this holds
++ * a refcount on the exiting task on return and the caller needs to drop it
++ * after waiting for the exit to complete.
+ */
+ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+ union futex_key *key,
+ struct futex_pi_state **ps,
+- struct task_struct *task, int set_waiters)
++ struct task_struct *task,
++ struct task_struct **exiting,
++ int set_waiters)
+ {
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *top_waiter;
+@@ -1440,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __us
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
+ */
+- return attach_to_pi_owner(uaddr, newval, key, ps);
++ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
+ }
+
+ /**
+@@ -1861,6 +1913,8 @@ void requeue_pi_wake_futex(struct futex_
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
++ * @exiting: Pointer to store the task pointer of the owner task
++ * which is in the middle of exiting
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+@@ -1868,16 +1922,20 @@ void requeue_pi_wake_futex(struct futex_
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
++ * @exiting is only set when the return value is -EBUSY. If so, this holds
++ * a refcount on the exiting task on return and the caller needs to drop it
++ * after waiting for the exit to complete.
++ *
+ * Return:
+ * - 0 - failed to acquire the lock atomically;
+ * - >0 - acquired the lock, return value is vpid of the top_waiter
+ * - <0 - error
+ */
+-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+- struct futex_hash_bucket *hb1,
+- struct futex_hash_bucket *hb2,
+- union futex_key *key1, union futex_key *key2,
+- struct futex_pi_state **ps, int set_waiters)
++static int
++futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
++ struct futex_hash_bucket *hb2, union futex_key *key1,
++ union futex_key *key2, struct futex_pi_state **ps,
++ struct task_struct **exiting, int set_waiters)
+ {
+ struct futex_q *top_waiter = NULL;
+ u32 curval;
+@@ -1914,7 +1972,7 @@ static int futex_proxy_trylock_atomic(u3
+ */
+ vpid = task_pid_vnr(top_waiter->task);
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+- set_waiters);
++ exiting, set_waiters);
+ if (ret == 1) {
+ requeue_pi_wake_futex(top_waiter, key2, hb2);
+ return vpid;
+@@ -2043,6 +2101,8 @@ retry_private:
+ }
+
+ if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
++ struct task_struct *exiting = NULL;
++
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+@@ -2050,7 +2110,8 @@ retry_private:
+ * faults rather in the requeue loop below.
+ */
+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+- &key2, &pi_state, nr_requeue);
++ &key2, &pi_state,
++ &exiting, nr_requeue);
+
+ /*
+ * At this point the top_waiter has either taken uaddr2 or is
+@@ -2077,7 +2138,8 @@ retry_private:
+ * If that call succeeds then we have pi_state and an
+ * initial refcount on it.
+ */
+- ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
++ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
++ &pi_state, &exiting);
+ }
+
+ switch (ret) {
+@@ -2107,6 +2169,12 @@ retry_private:
+ hb_waiters_dec(hb2);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
++ /*
++ * Handle the case where the owner is in the middle of
++ * exiting. Wait for the exit to complete otherwise
++ * this task might loop forever, aka. live lock.
++ */
++ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
+@@ -2834,6 +2902,7 @@ static int futex_lock_pi(u32 __user *uad
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
++ struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+@@ -2861,7 +2930,8 @@ retry:
+ retry_private:
+ hb = queue_lock(&q);
+
+- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
++ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
++ &exiting, 0);
+ if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+@@ -2884,6 +2954,12 @@ retry_private:
+ */
+ queue_unlock(hb);
+ put_futex_key(&q.key);
++ /*
++ * Handle the case where the owner is in the middle of
++ * exiting. Wait for the exit to complete otherwise
++ * this task might loop forever, aka. live lock.
++ */
++ wait_for_owner_exiting(ret, exiting);
+ cond_resched();
+ goto retry;
+ default:
--- /dev/null
+Subject: futex: Provide distinct return value when owner is exiting
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:45 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+`
+commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream
+
+attach_to_pi_owner() returns -EAGAIN for various cases:
+
+ - Owner task is exiting
+ - Futex value has changed
+
+The caller drops the held locks (hash bucket, mmap_sem) and retries the
+operation. In case of the owner task exiting this can result in a live
+lock.
+
+As a preparatory step for separating those cases, provide a distinct return
+value (EBUSY) for the owner exiting case.
+
+No functional change.
+
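+A hedged sketch of the caller pattern which this distinct return value
+enables (the stubs stand in for attach_to_pi_owner() and the mutex wait
+added by the exit livelock fix; the attempt counting is fabricated for
+the demo):
+
+#include <errno.h>
+#include <stdio.h>
+
+static int attempts;
+
+static int try_attach(void)	/* stands in for attach_to_pi_owner() */
+{
+	attempts++;
+	if (attempts == 1)
+		return -EBUSY;	/* owner is exiting */
+	if (attempts == 2)
+		return -EAGAIN;	/* user space value changed */
+	return 0;
+}
+
+static void wait_for_owner_exiting(void)
+{
+	/* block on the owner's futex_exit_mutex instead of spinning */
+}
+
+int main(void)
+{
+	for (;;) {
+		int ret = try_attach();
+
+		if (ret == -EBUSY) {	/* owner exiting: sleep it out */
+			wait_for_owner_exiting();
+			continue;
+		}
+		if (ret == -EAGAIN)	/* value changed: plain retry */
+			continue;
+		break;
+	}
+	printf("attached after %d attempts\n", attempts);
+	return 0;
+}
+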
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.935606117@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1182,11 +1182,11 @@ static int handle_exit_race(u32 __user *
+ u32 uval2;
+
+ /*
+- * If the futex exit state is not yet FUTEX_STATE_DEAD, wait
+- * for it to finish.
++ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
++ * caller that the alleged owner is busy.
+ */
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+- return -EAGAIN;
++ return -EBUSY;
+
+ /*
+ * Reread the user space value to handle the following situation:
+@@ -2095,12 +2095,13 @@ retry_private:
+ if (!ret)
+ goto retry;
+ goto out;
++ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+- * - Owner is exiting and we just wait for the
++ * - EBUSY: Owner is exiting and we just wait for the
+ * exit to complete.
+- * - The user space value changed.
++ * - EAGAIN: The user space value changed.
+ */
+ double_unlock_hb(hb1, hb2);
+ hb_waiters_dec(hb2);
+@@ -2873,12 +2874,13 @@ retry_private:
+ goto out_unlock_put_key;
+ case -EFAULT:
+ goto uaddr_faulted;
++ case -EBUSY:
+ case -EAGAIN:
+ /*
+ * Two reasons for this:
+- * - Task is exiting and we just wait for the
++ * - EBUSY: Task is exiting and we just wait for the
+ * exit to complete.
+- * - The user space value changed.
++ * - EAGAIN: The user space value changed.
+ */
+ queue_unlock(hb);
+ put_futex_key(&q.key);
--- /dev/null
+Subject: futex: Provide state handling for exec() as well
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:43 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit af8cbda2cfcaa5515d61ec500498d46e9a8247e2 upstream
+
+exec() attempts to handle potentially held futexes gracefully by running
+the futex exit handling code like exit() does.
+
+The current implementation has no protection against concurrent incoming
+waiters. The reason is that the futex state cannot be set to
+FUTEX_STATE_DEAD after the cleanup because the task struct is still active
+and just about to execute the new binary.
+
+While it's arguably buggy when a task holds a futex over exec(), for
+consistency's sake the state handling can at least cover the actual futex
+exit cleanup section. This provides state consistency protection across
+the cleanup. As the futex state of the task becomes FUTEX_STATE_OK after the
+cleanup has been finished, this cannot prevent subsequent attempts to
+attach to the task in case the cleanup was not successful in mopping
+up all leftovers.
+
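+The resulting pattern, as a hedged standalone sketch mirroring the diff
+below (struct task and main() are reduced stand-ins, not kernel code):
+
+enum { FUTEX_STATE_OK, FUTEX_STATE_EXITING, FUTEX_STATE_DEAD };
+
+struct task { int futex_state; };
+
+static void futex_cleanup_begin(struct task *t)
+{
+	t->futex_state = FUTEX_STATE_EXITING;
+}
+
+static void futex_cleanup(struct task *t)
+{
+	(void)t;	/* robust list and pi_state cleanup happens here */
+}
+
+static void futex_cleanup_end(struct task *t, int state)
+{
+	t->futex_state = state;
+}
+
+static void futex_exec_release(struct task *t)
+{
+	futex_cleanup_begin(t);
+	futex_cleanup(t);
+	/* exec(): the task keeps running and must become usable again */
+	futex_cleanup_end(t, FUTEX_STATE_OK);
+}
+
+static void futex_exit_release(struct task *t)
+{
+	futex_cleanup_begin(t);
+	futex_cleanup(t);
+	futex_cleanup_end(t, FUTEX_STATE_DEAD);
+}
+
+int main(void)
+{
+	struct task texec = { FUTEX_STATE_OK }, texit = { FUTEX_STATE_OK };
+
+	futex_exec_release(&texec);	/* ends in FUTEX_STATE_OK */
+	futex_exit_release(&texit);	/* ends in FUTEX_STATE_DEAD */
+	return texec.futex_state == FUTEX_STATE_OK &&
+	       texit.futex_state == FUTEX_STATE_DEAD ? 0 : 1;
+}
+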
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.753355618@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 38 ++++++++++++++++++++++++++++++++++----
+ 1 file changed, 34 insertions(+), 4 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3698,7 +3698,7 @@ static void exit_robust_list(struct task
+ }
+ }
+
+-void futex_exec_release(struct task_struct *tsk)
++static void futex_cleanup(struct task_struct *tsk)
+ {
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+@@ -3738,7 +3738,7 @@ void futex_exit_recursive(struct task_st
+ tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+
+-void futex_exit_release(struct task_struct *tsk)
++static void futex_cleanup_begin(struct task_struct *tsk)
+ {
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+@@ -3754,10 +3754,40 @@ void futex_exit_release(struct task_stru
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
++}
+
+- futex_exec_release(tsk);
++static void futex_cleanup_end(struct task_struct *tsk, int state)
++{
++ /*
++ * Lockless store. The only side effect is that an observer might
++ * take another loop until it becomes visible.
++ */
++ tsk->futex_state = state;
++}
+
+- tsk->futex_state = FUTEX_STATE_DEAD;
++void futex_exec_release(struct task_struct *tsk)
++{
++ /*
++ * The state handling is done for consistency, but in the case of
++ * exec() there is no way to prevent further damage as the PID stays
++ * the same. But for the unlikely and arguably buggy case that a
++ * futex is held on exec(), this provides at least as much state
++ * consistency protection as is possible.
++ */
++ futex_cleanup_begin(tsk);
++ futex_cleanup(tsk);
++ /*
++ * Reset the state to FUTEX_STATE_OK. The task is alive and about to
++ * exec a new binary.
++ */
++ futex_cleanup_end(tsk, FUTEX_STATE_OK);
++}
++
++void futex_exit_release(struct task_struct *tsk)
++{
++ futex_cleanup_begin(tsk);
++ futex_cleanup(tsk);
++ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+ }
+
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
--- /dev/null
+Subject: futex: Replace PF_EXITPIDONE with a state
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:37 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 3d4775df0a89240f671861c6ab6e8d59af8e9e41 upstream
+
+The futex exit handling relies on PF_ flags. That's suboptimal as it
+requires a smp_mb() and an ugly lock/unlock of the exiting task's pi_lock in
+the middle of do_exit() to enforce the observability of PF_EXITING in the
+futex code.
+
+Add a futex_state member to task_struct and convert the PF_EXITPIDONE logic
+over to the new state. The PF_EXITING dependency will be cleaned up in a
+later step.
+
+This prepares for handling various futex exit issues later.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.149449274@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h | 33 +++++++++++++++++++++++++++++++++
+ include/linux/sched.h | 2 +-
+ kernel/exit.c | 18 ++----------------
+ kernel/futex.c | 25 +++++++++++++------------
+ 4 files changed, 49 insertions(+), 29 deletions(-)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -53,6 +53,10 @@ union futex_key {
+ #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
+
+ #ifdef CONFIG_FUTEX
++enum {
++ FUTEX_STATE_OK,
++ FUTEX_STATE_DEAD,
++};
+
+ static inline void futex_init_task(struct task_struct *tsk)
+ {
+@@ -62,6 +66,34 @@ static inline void futex_init_task(struc
+ #endif
+ INIT_LIST_HEAD(&tsk->pi_state_list);
+ tsk->pi_state_cache = NULL;
++ tsk->futex_state = FUTEX_STATE_OK;
++}
++
++/**
++ * futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD
++ * @tsk: task to set the state on
++ *
++ * Set the futex exit state of the task lockless. The futex waiter code
++ * observes that state when a task is exiting and loops until the task has
++ * actually finished the futex cleanup. The worst case for this is that the
++ * waiter runs through the wait loop until the state becomes visible.
++ *
++ * This has two callers:
++ *
++ * - futex_mm_release() after the futex exit cleanup has been done
++ *
++ * - do_exit() from the recursive fault handling path.
++ *
++ * In case of a recursive fault this is best effort. Either the futex exit
++ * code has run already or not. If the OWNER_DIED bit has been set on the
++ * futex then the waiter can take it over. If not, the problem is pushed
++ * back to user space. If the futex exit code did not run yet, then an
++ * already queued waiter might block forever, but there is nothing which
++ * can be done about that.
++ */
++static inline void futex_exit_done(struct task_struct *tsk)
++{
++ tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+
+ void futex_mm_release(struct task_struct *tsk);
+@@ -71,6 +103,7 @@ long do_futex(u32 __user *uaddr, int op,
+ #else
+ static inline void futex_init_task(struct task_struct *tsk) { }
+ static inline void futex_mm_release(struct task_struct *tsk) { }
++static inline void futex_exit_done(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+ ktime_t *timeout, u32 __user *uaddr2,
+ u32 val2, u32 val3)
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -996,6 +996,7 @@ struct task_struct {
+ #endif
+ struct list_head pi_state_list;
+ struct futex_pi_state *pi_state_cache;
++ unsigned int futex_state;
+ #endif
+ #ifdef CONFIG_PERF_EVENTS
+ struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
+@@ -1377,7 +1378,6 @@ extern struct pid *cad_pid;
+ */
+ #define PF_IDLE 0x00000002 /* I am an IDLE thread */
+ #define PF_EXITING 0x00000004 /* Getting shut down */
+-#define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */
+ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
+ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
+ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -818,16 +818,7 @@ void __noreturn do_exit(long code)
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+- /*
+- * We can do this unlocked here. The futex code uses
+- * this flag just to verify whether the pi state
+- * cleanup has been done or not. In the worst case it
+- * loops once more. We pretend that the cleanup was
+- * done as there is no way to return. Either the
+- * OWNER_DIED bit is set by now or we push the blocked
+- * task into the wait for ever nirwana as well.
+- */
+- tsk->flags |= PF_EXITPIDONE;
++ futex_exit_done(tsk);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ }
+@@ -918,12 +909,7 @@ void __noreturn do_exit(long code)
+ * Make sure we are holding no locks:
+ */
+ debug_check_no_locks_held();
+- /*
+- * We can do this unlocked here. The futex code uses this flag
+- * just to verify whether the pi state cleanup has been done
+- * or not. In the worst case it loops once more.
+- */
+- tsk->flags |= PF_EXITPIDONE;
++ futex_exit_done(tsk);
+
+ if (tsk->io_context)
+ exit_io_context(tsk);
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user *
+ u32 uval2;
+
+ /*
+- * If PF_EXITPIDONE is not yet set, then try again.
++ * If the futex exit state is not yet FUTEX_STATE_DEAD, wait
++ * for it to finish.
+ */
+- if (tsk && !(tsk->flags & PF_EXITPIDONE))
++ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EAGAIN;
+
+ /*
+@@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user *
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+- * tsk->flags |= PF_EXITPIDONE; } else {
+- * if (!(tsk->flags & PF_EXITPIDONE))
++ * tsk->futex_state = } else {
++ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
++ * FUTEX_STATE_DEAD)
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+@@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user
+ }
+
+ /*
+- * We need to look at the task state flags to figure out,
+- * whether the task is exiting. To protect against the do_exit
+- * change of the task flags, we do this protected by
+- * p->pi_lock:
++ * We need to look at the task state to figure out, whether the
++ * task is exiting. To protect against the change of the task state
++ * in futex_exit_release(), we do this protected by p->pi_lock:
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+- if (unlikely(p->flags & PF_EXITING)) {
++ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+ /*
+- * The task is on the way out. When PF_EXITPIDONE is
+- * set, we know that the task has finished the
+- * cleanup:
++ * The task is on the way out. When the futex state is
++ * FUTEX_STATE_DEAD, we know that the task has finished
++ * the cleanup:
+ */
+ int ret = handle_exit_race(uaddr, uval, p);
+
--- /dev/null
+Subject: futex: Sanitize exit state handling
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:42 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 4a8e991b91aca9e20705d434677ac013974e0e30 upstream
+
+Instead of having a smp_mb() and an empty lock/unlock of task::pi_lock, move
+the state setting into the lock section. The unlock of pi_lock pairs with the
+lock acquisition on the waiter side, which makes the standalone barrier
+superfluous.
+
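+A hedged userspace model of the resulting ordering argument (pi_lock
+modelled as a pthread mutex; function names are illustrative): a store
+made inside the lock section is serialized against every later critical
+section, so no standalone barrier is needed.
+
+#include <pthread.h>
+
+static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
+static int futex_state;		/* 0 = OK, 1 = EXITING */
+
+static void exit_side(void)	/* models futex_exit_release() */
+{
+	pthread_mutex_lock(&pi_lock);
+	futex_state = 1;	/* no smp_mb() needed inside the section */
+	pthread_mutex_unlock(&pi_lock);
+}
+
+static int waiter_side(void)	/* models attach_to_pi_owner() */
+{
+	pthread_mutex_lock(&pi_lock);
+	int state = futex_state; /* serialized against the store above */
+	pthread_mutex_unlock(&pi_lock);
+	return state;
+}
+
+int main(void)
+{
+	exit_side();
+	return waiter_side() == 1 ? 0 : 1;
+}
+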
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.645603214@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 17 ++++++++++-------
+ 1 file changed, 10 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3740,16 +3740,19 @@ void futex_exit_recursive(struct task_st
+
+ void futex_exit_release(struct task_struct *tsk)
+ {
+- tsk->futex_state = FUTEX_STATE_EXITING;
+- /*
+- * Ensure that all new tsk->pi_lock acquisitions must observe
+- * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner().
+- */
+- smp_mb();
+ /*
+- * Ensure that we must observe the pi_state in exit_pi_state_list().
++ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
++ *
++ * This ensures that all subsequent checks of tsk->futex_state in
++ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
++ * tsk->pi_lock held.
++ *
++ * It guarantees also that a pi_state which was queued right before
++ * the state change under tsk->pi_lock by a concurrent waiter must
++ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
++ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+
+ futex_exec_release(tsk);
--- /dev/null
+Subject: futex: Set task::futex_state to DEAD right after handling futex exit
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:40 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit f24f22435dcc11389acc87e5586239c1819d217c upstream
+
+Setting task::futex_state in do_exit() is rather arbitrarily placed. Move it
+into the futex code.
+
+Note, this is only done for the exit cleanup as the exec cleanup cannot set
+the state to FUTEX_STATE_DEAD because the task struct is still in active
+use.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.439511191@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/exit.c | 1 -
+ kernel/futex.c | 1 +
+ 2 files changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -909,7 +909,6 @@ void __noreturn do_exit(long code)
+ * Make sure we are holding no locks:
+ */
+ debug_check_no_locks_held();
+- futex_exit_done(tsk);
+
+ if (tsk->io_context)
+ exit_io_context(tsk);
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3719,6 +3719,7 @@ void futex_exec_release(struct task_stru
+ void futex_exit_release(struct task_struct *tsk)
+ {
+ futex_exec_release(tsk);
++ futex_exit_done(tsk);
+ }
+
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
--- /dev/null
+Subject: futex: Split futex_mm_release() for exit/exec
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov 6 22:55:39 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 150d71584b12809144b8145b817e83b81158ae5f upstream
+
+To allow separate handling of the futex exit state in the futex exit code
+for exit and exec, split futex_mm_release() into two functions and invoke
+them from the corresponding exit/exec_mm_release() callsites.
+
+Preparatory only, no functional change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.332094221@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h | 6 ++++--
+ kernel/fork.c | 5 ++---
+ kernel/futex.c | 7 ++++++-
+ 3 files changed, 12 insertions(+), 6 deletions(-)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -96,14 +96,16 @@ static inline void futex_exit_done(struc
+ tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+
+-void futex_mm_release(struct task_struct *tsk);
++void futex_exit_release(struct task_struct *tsk);
++void futex_exec_release(struct task_struct *tsk);
+
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3);
+ #else
+ static inline void futex_init_task(struct task_struct *tsk) { }
+-static inline void futex_mm_release(struct task_struct *tsk) { }
+ static inline void futex_exit_done(struct task_struct *tsk) { }
++static inline void futex_exit_release(struct task_struct *tsk) { }
++static inline void futex_exec_release(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+ ktime_t *timeout, u32 __user *uaddr2,
+ u32 val2, u32 val3)
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1219,9 +1219,6 @@ static int wait_for_vfork_done(struct ta
+ */
+ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
+- /* Get rid of any futexes when releasing the mm */
+- futex_mm_release(tsk);
+-
+ uprobe_free_utask(tsk);
+
+ /* Get rid of any cached register state */
+@@ -1256,11 +1253,13 @@ static void mm_release(struct task_struc
+
+ void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
++ futex_exit_release(tsk);
+ mm_release(tsk, mm);
+ }
+
+ void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
++ futex_exec_release(tsk);
+ mm_release(tsk, mm);
+ }
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3698,7 +3698,7 @@ static void exit_robust_list(struct task
+ }
+ }
+
+-void futex_mm_release(struct task_struct *tsk)
++void futex_exec_release(struct task_struct *tsk)
+ {
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+@@ -3716,6 +3716,11 @@ void futex_mm_release(struct task_struct
+ exit_pi_state_list(tsk);
+ }
+
++void futex_exit_release(struct task_struct *tsk)
++{
++ futex_exec_release(tsk);
++}
++
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+ u32 __user *uaddr2, u32 val2, u32 val3)
+ {
--- /dev/null
+futex_Move_futex_exit_handling_into_futex_code.patch
+futex_Replace_PF_EXITPIDONE_with_a_state.patch
+exitexec_Separate_mm_release().patch
+futex_Split_futex_mm_release()_for_exitexec.patch
+futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch
+futex_Mark_the_begin_of_futex_exit_explicitly.patch
+futex_Sanitize_exit_state_handling.patch
+futex_Provide_state_handling_for_exec()_as_well.patch
+futex_Add_mutex_around_futex_exit.patch
+futex_Provide_distinct_return_value_when_owner_is_exiting.patch
+futex_Prevent_exit_livelock.patch