]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Futex patches stashed for the next 4.19 stable release.
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 26 Jan 2021 10:29:34 +0000 (11:29 +0100)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 26 Jan 2021 10:29:34 +0000 (11:29 +0100)
12 files changed:
pending/futex-4.19/exitexec_Seperate_mm_release().patch [new file with mode: 0644]
pending/futex-4.19/futex_Add_mutex_around_futex_exit.patch [new file with mode: 0644]
pending/futex-4.19/futex_Mark_the_begin_of_futex_exit_explicitly.patch [new file with mode: 0644]
pending/futex-4.19/futex_Move_futex_exit_handling_into_futex_code.patch [new file with mode: 0644]
pending/futex-4.19/futex_Prevent_exit_livelock.patch [new file with mode: 0644]
pending/futex-4.19/futex_Provide_distinct_return_value_when_owner_is_exiting.patch [new file with mode: 0644]
pending/futex-4.19/futex_Provide_state_handling_for_exec()_as_well.patch [new file with mode: 0644]
pending/futex-4.19/futex_Replace_PF_EXITPIDONE_with_a_state.patch [new file with mode: 0644]
pending/futex-4.19/futex_Sanitize_exit_state_handling.patch [new file with mode: 0644]
pending/futex-4.19/futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch [new file with mode: 0644]
pending/futex-4.19/futex_Split_futex_mm_release()_for_exitexec.patch [new file with mode: 0644]
pending/futex-4.19/series [new file with mode: 0644]

diff --git a/pending/futex-4.19/exitexec_Seperate_mm_release().patch b/pending/futex-4.19/exitexec_Seperate_mm_release().patch
new file mode 100644 (file)
index 0000000..6f1eca6
--- /dev/null
@@ -0,0 +1,97 @@
+Subject: exit/exec: Seperate mm_release()
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:38 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 4610ba7ad877fafc0a25a30c6c82015304120426 upstream
+
+mm_release() contains the futex exit handling. mm_release() is called from
+do_exit()->exit_mm() and from exec()->exec_mm().
+
+In the exit_mm() case PF_EXITING and the futex state is updated. In the
+exec_mm() case these states are not touched.
+
+As the futex exit code needs further protections against exit races, this
+needs to be split into two functions.
+
+Preparatory only, no functional change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.240518241@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/exec.c                |    2 +-
+ include/linux/sched/mm.h |    6 ++++--
+ kernel/exit.c            |    2 +-
+ kernel/fork.c            |   12 +++++++++++-
+ 4 files changed, 17 insertions(+), 5 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1011,7 +1011,7 @@ static int exec_mmap(struct mm_struct *m
+       /* Notify parent that we're no longer interested in the old VM */
+       tsk = current;
+       old_mm = current->mm;
+-      mm_release(tsk, old_mm);
++      exec_mm_release(tsk, old_mm);
+       if (old_mm) {
+               sync_mm_rss(old_mm);
+--- a/include/linux/sched/mm.h
++++ b/include/linux/sched/mm.h
+@@ -119,8 +119,10 @@ extern struct mm_struct *get_task_mm(str
+  * succeeds.
+  */
+ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
+-/* Remove the current tasks stale references to the old mm_struct */
+-extern void mm_release(struct task_struct *, struct mm_struct *);
++/* Remove the current tasks stale references to the old mm_struct on exit() */
++extern void exit_mm_release(struct task_struct *, struct mm_struct *);
++/* Remove the current tasks stale references to the old mm_struct on exec() */
++extern void exec_mm_release(struct task_struct *, struct mm_struct *);
+ #ifdef CONFIG_MEMCG
+ extern void mm_update_next_owner(struct mm_struct *mm);
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -498,7 +498,7 @@ static void exit_mm(void)
+       struct mm_struct *mm = current->mm;
+       struct core_state *core_state;
+-      mm_release(current, mm);
++      exit_mm_release(current, mm);
+       if (!mm)
+               return;
+       sync_mm_rss(mm);
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1217,7 +1217,7 @@ static int wait_for_vfork_done(struct ta
+  * restoring the old one. . .
+  * Eric Biederman 10 January 1998
+  */
+-void mm_release(struct task_struct *tsk, struct mm_struct *mm)
++static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
+       /* Get rid of any futexes when releasing the mm */
+       futex_mm_release(tsk);
+@@ -1254,6 +1254,16 @@ void mm_release(struct task_struct *tsk,
+               complete_vfork_done(tsk);
+ }
++void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
++{
++      mm_release(tsk, mm);
++}
++
++void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
++{
++      mm_release(tsk, mm);
++}
++
+ /*
+  * Allocate a new mm structure and copy contents from the
+  * mm structure of the passed in task structure.
diff --git a/pending/futex-4.19/futex_Add_mutex_around_futex_exit.patch b/pending/futex-4.19/futex_Add_mutex_around_futex_exit.patch
new file mode 100644 (file)
index 0000000..ef9d6d5
--- /dev/null
@@ -0,0 +1,82 @@
+Subject: futex: Add mutex around futex exit
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:44 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 3f186d974826847a07bc7964d79ec4eded475ad9 upstream
+
+The mutex will be used in subsequent changes to replace the busy looping of
+a waiter when the futex owner is currently executing the exit cleanup to
+prevent a potential live lock.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.845798895@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h |    1 +
+ include/linux/sched.h |    1 +
+ kernel/futex.c        |   16 ++++++++++++++++
+ 3 files changed, 18 insertions(+)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -68,6 +68,7 @@ static inline void futex_init_task(struc
+       INIT_LIST_HEAD(&tsk->pi_state_list);
+       tsk->pi_state_cache = NULL;
+       tsk->futex_state = FUTEX_STATE_OK;
++      mutex_init(&tsk->futex_exit_mutex);
+ }
+ void futex_exit_recursive(struct task_struct *tsk);
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -996,6 +996,7 @@ struct task_struct {
+ #endif
+       struct list_head                pi_state_list;
+       struct futex_pi_state           *pi_state_cache;
++      struct mutex                    futex_exit_mutex;
+       unsigned int                    futex_state;
+ #endif
+ #ifdef CONFIG_PERF_EVENTS
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3735,12 +3735,23 @@ static void futex_cleanup(struct task_st
+  */
+ void futex_exit_recursive(struct task_struct *tsk)
+ {
++      /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
++      if (tsk->futex_state == FUTEX_STATE_EXITING)
++              mutex_unlock(&tsk->futex_exit_mutex);
+       tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+ static void futex_cleanup_begin(struct task_struct *tsk)
+ {
+       /*
++       * Prevent various race issues against a concurrent incoming waiter
++       * including live locks by forcing the waiter to block on
++       * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
++       * attach_to_pi_owner().
++       */
++      mutex_lock(&tsk->futex_exit_mutex);
++
++      /*
+        * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+        *
+        * This ensures that all subsequent checks of tsk->futex_state in
+@@ -3763,6 +3774,11 @@ static void futex_cleanup_end(struct tas
+        * take another loop until it becomes visible.
+        */
+       tsk->futex_state = state;
++      /*
++       * Drop the exit protection. This unblocks waiters which observed
++       * FUTEX_STATE_EXITING to reevaluate the state.
++       */
++      mutex_unlock(&tsk->futex_exit_mutex);
+ }
+ void futex_exec_release(struct task_struct *tsk)
diff --git a/pending/futex-4.19/futex_Mark_the_begin_of_futex_exit_explicitly.patch b/pending/futex-4.19/futex_Mark_the_begin_of_futex_exit_explicitly.patch
new file mode 100644 (file)
index 0000000..973da35
--- /dev/null
@@ -0,0 +1,159 @@
+Subject: futex: Mark the begin of futex exit explicitly
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:41 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 18f694385c4fd77a09851fd301236746ca83f3cb upstream
+
+Instead of relying on PF_EXITING use an explicit state for the futex exit
+and set it in the futex exit function. This moves the smp barrier and the
+lock/unlock serialization into the futex code.
+
+As with the DEAD state this is restricted to the exit path as exec
+continues to use the same task struct.
+
+This allows to simplify that logic in a next step.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.539409004@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h |   31 +++----------------------------
+ kernel/exit.c         |   13 +------------
+ kernel/futex.c        |   37 ++++++++++++++++++++++++++++++++++++-
+ 3 files changed, 40 insertions(+), 41 deletions(-)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -55,6 +55,7 @@ union futex_key {
+ #ifdef CONFIG_FUTEX
+ enum {
+       FUTEX_STATE_OK,
++      FUTEX_STATE_EXITING,
+       FUTEX_STATE_DEAD,
+ };
+@@ -69,33 +70,7 @@ static inline void futex_init_task(struc
+       tsk->futex_state = FUTEX_STATE_OK;
+ }
+-/**
+- * futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD
+- * @tsk:      task to set the state on
+- *
+- * Set the futex exit state of the task lockless. The futex waiter code
+- * observes that state when a task is exiting and loops until the task has
+- * actually finished the futex cleanup. The worst case for this is that the
+- * waiter runs through the wait loop until the state becomes visible.
+- *
+- * This has two callers:
+- *
+- * - futex_mm_release() after the futex exit cleanup has been done
+- *
+- * - do_exit() from the recursive fault handling path.
+- *
+- * In case of a recursive fault this is best effort. Either the futex exit
+- * code has run already or not. If the OWNER_DIED bit has been set on the
+- * futex then the waiter can take it over. If not, the problem is pushed
+- * back to user space. If the futex exit code did not run yet, then an
+- * already queued waiter might block forever, but there is nothing which
+- * can be done about that.
+- */
+-static inline void futex_exit_done(struct task_struct *tsk)
+-{
+-      tsk->futex_state = FUTEX_STATE_DEAD;
+-}
+-
++void futex_exit_recursive(struct task_struct *tsk);
+ void futex_exit_release(struct task_struct *tsk);
+ void futex_exec_release(struct task_struct *tsk);
+@@ -103,7 +78,7 @@ long do_futex(u32 __user *uaddr, int op,
+             u32 __user *uaddr2, u32 val2, u32 val3);
+ #else
+ static inline void futex_init_task(struct task_struct *tsk) { }
+-static inline void futex_exit_done(struct task_struct *tsk) { }
++static inline void futex_exit_recursive(struct task_struct *tsk) { }
+ static inline void futex_exit_release(struct task_struct *tsk) { }
+ static inline void futex_exec_release(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -818,23 +818,12 @@ void __noreturn do_exit(long code)
+        */
+       if (unlikely(tsk->flags & PF_EXITING)) {
+               pr_alert("Fixing recursive fault but reboot is needed!\n");
+-              futex_exit_done(tsk);
++              futex_exit_recursive(tsk);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               schedule();
+       }
+       exit_signals(tsk);  /* sets PF_EXITING */
+-      /*
+-       * Ensure that all new tsk->pi_lock acquisitions must observe
+-       * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
+-       */
+-      smp_mb();
+-      /*
+-       * Ensure that we must observe the pi_state in exit_mm() ->
+-       * mm_release() -> exit_pi_state_list().
+-       */
+-      raw_spin_lock_irq(&tsk->pi_lock);
+-      raw_spin_unlock_irq(&tsk->pi_lock);
+       /* sync mm's RSS info before statistics gathering */
+       if (tsk->mm)
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3716,10 +3716,45 @@ void futex_exec_release(struct task_stru
+               exit_pi_state_list(tsk);
+ }
++/**
++ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
++ * @tsk:      task to set the state on
++ *
++ * Set the futex exit state of the task lockless. The futex waiter code
++ * observes that state when a task is exiting and loops until the task has
++ * actually finished the futex cleanup. The worst case for this is that the
++ * waiter runs through the wait loop until the state becomes visible.
++ *
++ * This is called from the recursive fault handling path in do_exit().
++ *
++ * This is best effort. Either the futex exit code has run already or
++ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
++ * take it over. If not, the problem is pushed back to user space. If the
++ * futex exit code did not run yet, then an already queued waiter might
++ * block forever, but there is nothing which can be done about that.
++ */
++void futex_exit_recursive(struct task_struct *tsk)
++{
++      tsk->futex_state = FUTEX_STATE_DEAD;
++}
++
+ void futex_exit_release(struct task_struct *tsk)
+ {
++      tsk->futex_state = FUTEX_STATE_EXITING;
++      /*
++       * Ensure that all new tsk->pi_lock acquisitions must observe
++       * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner().
++       */
++      smp_mb();
++      /*
++       * Ensure that we must observe the pi_state in exit_pi_state_list().
++       */
++      raw_spin_lock_irq(&tsk->pi_lock);
++      raw_spin_unlock_irq(&tsk->pi_lock);
++
+       futex_exec_release(tsk);
+-      futex_exit_done(tsk);
++
++      tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
diff --git a/pending/futex-4.19/futex_Move_futex_exit_handling_into_futex_code.patch b/pending/futex-4.19/futex_Move_futex_exit_handling_into_futex_code.patch
new file mode 100644 (file)
index 0000000..d08d9ed
--- /dev/null
@@ -0,0 +1,216 @@
+Subject: futex: Move futex exit handling into futex code
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:36 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit ba31c1a48538992316cc71ce94fa9cd3e7b427c0 upstream
+
+The futex exit handling is #ifdeffed into mm_release() which is not pretty
+to begin with. But upcoming changes to address futex exit races need to add
+more functionality to this exit code.
+
+Split it out into a function, move it into futex code and make the various
+futex exit functions static.
+
+Preparatory only and no functional change.
+
+Folded build fix from Borislav.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.049705556@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/compat.h |    2 --
+ include/linux/futex.h  |   29 ++++++++++++++++-------------
+ kernel/fork.c          |   25 +++----------------------
+ kernel/futex.c         |   33 +++++++++++++++++++++++++++++----
+ 4 files changed, 48 insertions(+), 41 deletions(-)
+
+--- a/include/linux/compat.h
++++ b/include/linux/compat.h
+@@ -445,8 +445,6 @@ struct compat_kexec_segment;
+ struct compat_mq_attr;
+ struct compat_msgbuf;
+-extern void compat_exit_robust_list(struct task_struct *curr);
+-
+ #define BITS_PER_COMPAT_LONG    (8*sizeof(compat_long_t))
+ #define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG)
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -2,7 +2,9 @@
+ #ifndef _LINUX_FUTEX_H
+ #define _LINUX_FUTEX_H
++#include <linux/sched.h>
+ #include <linux/ktime.h>
++
+ #include <uapi/linux/futex.h>
+ struct inode;
+@@ -51,15 +53,24 @@ union futex_key {
+ #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
+ #ifdef CONFIG_FUTEX
+-extern void exit_robust_list(struct task_struct *curr);
+-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+-            u32 __user *uaddr2, u32 val2, u32 val3);
+-#else
+-static inline void exit_robust_list(struct task_struct *curr)
++static inline void futex_init_task(struct task_struct *tsk)
+ {
++      tsk->robust_list = NULL;
++#ifdef CONFIG_COMPAT
++      tsk->compat_robust_list = NULL;
++#endif
++      INIT_LIST_HEAD(&tsk->pi_state_list);
++      tsk->pi_state_cache = NULL;
+ }
++void futex_mm_release(struct task_struct *tsk);
++
++long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
++            u32 __user *uaddr2, u32 val2, u32 val3);
++#else
++static inline void futex_init_task(struct task_struct *tsk) { }
++static inline void futex_mm_release(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+                           ktime_t *timeout, u32 __user *uaddr2,
+                           u32 val2, u32 val3)
+@@ -68,12 +79,4 @@ static inline long do_futex(u32 __user *
+ }
+ #endif
+-#ifdef CONFIG_FUTEX_PI
+-extern void exit_pi_state_list(struct task_struct *curr);
+-#else
+-static inline void exit_pi_state_list(struct task_struct *curr)
+-{
+-}
+-#endif
+-
+ #endif
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1220,20 +1220,7 @@ static int wait_for_vfork_done(struct ta
+ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
+       /* Get rid of any futexes when releasing the mm */
+-#ifdef CONFIG_FUTEX
+-      if (unlikely(tsk->robust_list)) {
+-              exit_robust_list(tsk);
+-              tsk->robust_list = NULL;
+-      }
+-#ifdef CONFIG_COMPAT
+-      if (unlikely(tsk->compat_robust_list)) {
+-              compat_exit_robust_list(tsk);
+-              tsk->compat_robust_list = NULL;
+-      }
+-#endif
+-      if (unlikely(!list_empty(&tsk->pi_state_list)))
+-              exit_pi_state_list(tsk);
+-#endif
++      futex_mm_release(tsk);
+       uprobe_free_utask(tsk);
+@@ -1937,14 +1924,8 @@ static __latent_entropy struct task_stru
+ #ifdef CONFIG_BLOCK
+       p->plug = NULL;
+ #endif
+-#ifdef CONFIG_FUTEX
+-      p->robust_list = NULL;
+-#ifdef CONFIG_COMPAT
+-      p->compat_robust_list = NULL;
+-#endif
+-      INIT_LIST_HEAD(&p->pi_state_list);
+-      p->pi_state_cache = NULL;
+-#endif
++      futex_init_task(p);
++
+       /*
+        * sigaltstack should be cleared when sharing the same VM
+        */
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -341,6 +341,12 @@ static inline bool should_fail_futex(boo
+ }
+ #endif /* CONFIG_FAIL_FUTEX */
++#ifdef CONFIG_COMPAT
++static void compat_exit_robust_list(struct task_struct *curr);
++#else
++static inline void compat_exit_robust_list(struct task_struct *curr) { }
++#endif
++
+ static inline void futex_get_mm(union futex_key *key)
+ {
+       mmgrab(key->private.mm);
+@@ -890,7 +896,7 @@ static void put_pi_state(struct futex_pi
+  * Kernel cleans up PI-state, but userspace is likely hosed.
+  * (Robust-futex cleanup is separate and might save the day for userspace.)
+  */
+-void exit_pi_state_list(struct task_struct *curr)
++static void exit_pi_state_list(struct task_struct *curr)
+ {
+       struct list_head *next, *head = &curr->pi_state_list;
+       struct futex_pi_state *pi_state;
+@@ -960,7 +966,8 @@ void exit_pi_state_list(struct task_stru
+       }
+       raw_spin_unlock_irq(&curr->pi_lock);
+ }
+-
++#else
++static inline void exit_pi_state_list(struct task_struct *curr) { }
+ #endif
+ /*
+@@ -3625,7 +3632,7 @@ static inline int fetch_robust_entry(str
+  *
+  * We silently return on any sign of list-walking problem.
+  */
+-void exit_robust_list(struct task_struct *curr)
++static void exit_robust_list(struct task_struct *curr)
+ {
+       struct robust_list_head __user *head = curr->robust_list;
+       struct robust_list __user *entry, *next_entry, *pending;
+@@ -3690,6 +3697,24 @@ void exit_robust_list(struct task_struct
+       }
+ }
++void futex_mm_release(struct task_struct *tsk)
++{
++      if (unlikely(tsk->robust_list)) {
++              exit_robust_list(tsk);
++              tsk->robust_list = NULL;
++      }
++
++#ifdef CONFIG_COMPAT
++      if (unlikely(tsk->compat_robust_list)) {
++              compat_exit_robust_list(tsk);
++              tsk->compat_robust_list = NULL;
++      }
++#endif
++
++      if (unlikely(!list_empty(&tsk->pi_state_list)))
++              exit_pi_state_list(tsk);
++}
++
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+               u32 __user *uaddr2, u32 val2, u32 val3)
+ {
+@@ -3817,7 +3842,7 @@ static void __user *futex_uaddr(struct r
+  *
+  * We silently return on any sign of list-walking problem.
+  */
+-void compat_exit_robust_list(struct task_struct *curr)
++static void compat_exit_robust_list(struct task_struct *curr)
+ {
+       struct compat_robust_list_head __user *head = curr->compat_robust_list;
+       struct robust_list __user *entry, *next_entry, *pending;
diff --git a/pending/futex-4.19/futex_Prevent_exit_livelock.patch b/pending/futex-4.19/futex_Prevent_exit_livelock.patch
new file mode 100644 (file)
index 0000000..99b8d8c
--- /dev/null
@@ -0,0 +1,342 @@
+Subject: futex: Prevent exit livelock
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:46 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 3ef240eaff36b8119ac9e2ea17cbf41179c930ba upstream
+
+Oleg provided the following test case:
+
+int main(void)
+{
+       struct sched_param sp = {};
+
+       sp.sched_priority = 2;
+       assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
+
+       int lock = vfork();
+       if (!lock) {
+               sp.sched_priority = 1;
+               assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
+               _exit(0);
+       }
+
+       syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
+       return 0;
+}
+
+This creates an unkillable RT process spinning in futex_lock_pi() on a UP
+machine or if the process is affine to a single CPU. The reason is:
+
+ parent                                        child
+
+  set FIFO prio 2
+
+  vfork()                      ->      set FIFO prio 1
+   implies wait_for_child()            sched_setscheduler(...)
+                                       exit()
+                                       do_exit()
+                                       ....
+                                       mm_release()
+                                         tsk->futex_state = FUTEX_STATE_EXITING;
+                                         exit_futex(); (NOOP in this case)
+                                         complete() --> wakes parent
+  sys_futex()
+    loop infinite because
+    tsk->futex_state == FUTEX_STATE_EXITING
+
+The same problem can happen just by regular preemption as well:
+
+  task holds futex
+  ...
+  do_exit()
+    tsk->futex_state = FUTEX_STATE_EXITING;
+
+  --> preemption (unrelated wakeup of some other higher prio task, e.g. timer)
+
+  switch_to(other_task)
+
+  return to user
+  sys_futex()
+       loop infinite as above
+
+Just for the fun of it the futex exit cleanup could trigger the wakeup
+itself before the task sets its futex state to DEAD.
+
+To cure this, the handling of the exiting owner is changed so:
+
+   - A refcount is held on the task
+
+   - The task pointer is stored in a caller visible location
+
+   - The caller drops all locks (hash bucket, mmap_sem) and blocks
+     on task::futex_exit_mutex. When the mutex is acquired then
+     the exiting task has completed the cleanup and the state
+     is consistent and can be reevaluated.
+
+This is not a pretty solution, but there is no choice other than returning
+an error code to user space, which would break the state consistency
+guarantee and open another can of problems including regressions.
+
+For stable backports the preparatory commits ac31c7ff8624 .. ba31c1a48538
+are required as well, but for anything older than 5.3.y the backports are
+going to be provided when this hits mainline as the other dependencies for
+those kernels are definitely not stable material.
+
+Fixes: 778e9a9c3e71 ("pi-futex: fix exit races and locking problems")
+Reported-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Stable Team <stable@vger.kernel.org>
+Link: https://lkml.kernel.org/r/20191106224557.041676471@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |  106 ++++++++++++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 91 insertions(+), 15 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1176,6 +1176,36 @@ out_error:
+       return ret;
+ }
++/**
++ * wait_for_owner_exiting - Block until the owner has exited
++ * @exiting:  Pointer to the exiting task
++ *
++ * Caller must hold a refcount on @exiting.
++ */
++static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
++{
++      if (ret != -EBUSY) {
++              WARN_ON_ONCE(exiting);
++              return;
++      }
++
++      if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
++              return;
++
++      mutex_lock(&exiting->futex_exit_mutex);
++      /*
++       * No point in doing state checking here. If the waiter got here
++       * while the task was in exec()->exec_futex_release() then it can
++       * have any FUTEX_STATE_* value when the waiter has acquired the
++       * mutex. OK, if running, EXITING or DEAD if it reached exit()
++       * already. Highly unlikely and not a problem. Just one more round
++       * through the futex maze.
++       */
++      mutex_unlock(&exiting->futex_exit_mutex);
++
++      put_task_struct(exiting);
++}
++
+ static int handle_exit_race(u32 __user *uaddr, u32 uval,
+                           struct task_struct *tsk)
+ {
+@@ -1237,7 +1267,8 @@ static int handle_exit_race(u32 __user *
+  * it after doing proper sanity checks.
+  */
+ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+-                            struct futex_pi_state **ps)
++                            struct futex_pi_state **ps,
++                            struct task_struct **exiting)
+ {
+       pid_t pid = uval & FUTEX_TID_MASK;
+       struct futex_pi_state *pi_state;
+@@ -1276,7 +1307,19 @@ static int attach_to_pi_owner(u32 __user
+               int ret = handle_exit_race(uaddr, uval, p);
+               raw_spin_unlock_irq(&p->pi_lock);
+-              put_task_struct(p);
++              /*
++               * If the owner task is between FUTEX_STATE_EXITING and
++               * FUTEX_STATE_DEAD then store the task pointer and keep
++               * the reference on the task struct. The calling code will
++               * drop all locks, wait for the task to reach
++               * FUTEX_STATE_DEAD and then drop the refcount. This is
++               * required to prevent a live lock when the current task
++               * preempted the exiting task between the two states.
++               */
++              if (ret == -EBUSY)
++                      *exiting = p;
++              else
++                      put_task_struct(p);
+               return ret;
+       }
+@@ -1315,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user
+ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+                          struct futex_hash_bucket *hb,
+-                         union futex_key *key, struct futex_pi_state **ps)
++                         union futex_key *key, struct futex_pi_state **ps,
++                         struct task_struct **exiting)
+ {
+       struct futex_q *top_waiter = futex_top_waiter(hb, key);
+@@ -1330,7 +1374,7 @@ static int lookup_pi_state(u32 __user *u
+        * We are the first waiter - try to look up the owner based on
+        * @uval and attach to it.
+        */
+-      return attach_to_pi_owner(uaddr, uval, key, ps);
++      return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
+ }
+ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+@@ -1358,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __u
+  *                    lookup
+  * @task:             the task to perform the atomic lock work for.  This will
+  *                    be "current" except in the case of requeue pi.
++ * @exiting:          Pointer to store the task pointer of the owner task
++ *                    which is in the middle of exiting
+  * @set_waiters:      force setting the FUTEX_WAITERS bit (1) or not (0)
+  *
+  * Return:
+@@ -1366,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __u
+  *  - <0 - error
+  *
+  * The hb->lock and futex_key refs shall be held by the caller.
++ *
++ * @exiting is only set when the return value is -EBUSY. If so, this holds
++ * a refcount on the exiting task on return and the caller needs to drop it
++ * after waiting for the exit to complete.
+  */
+ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+                               union futex_key *key,
+                               struct futex_pi_state **ps,
+-                              struct task_struct *task, int set_waiters)
++                              struct task_struct *task,
++                              struct task_struct **exiting,
++                              int set_waiters)
+ {
+       u32 uval, newval, vpid = task_pid_vnr(task);
+       struct futex_q *top_waiter;
+@@ -1440,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __us
+        * attach to the owner. If that fails, no harm done, we only
+        * set the FUTEX_WAITERS bit in the user space variable.
+        */
+-      return attach_to_pi_owner(uaddr, newval, key, ps);
++      return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
+ }
+ /**
+@@ -1861,6 +1913,8 @@ void requeue_pi_wake_futex(struct futex_
+  * @key1:             the from futex key
+  * @key2:             the to futex key
+  * @ps:                       address to store the pi_state pointer
++ * @exiting:          Pointer to store the task pointer of the owner task
++ *                    which is in the middle of exiting
+  * @set_waiters:      force setting the FUTEX_WAITERS bit (1) or not (0)
+  *
+  * Try and get the lock on behalf of the top waiter if we can do it atomically.
+@@ -1868,16 +1922,20 @@ void requeue_pi_wake_futex(struct futex_
+  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+  * hb1 and hb2 must be held by the caller.
+  *
++ * @exiting is only set when the return value is -EBUSY. If so, this holds
++ * a refcount on the exiting task on return and the caller needs to drop it
++ * after waiting for the exit to complete.
++ *
+  * Return:
+  *  -  0 - failed to acquire the lock atomically;
+  *  - >0 - acquired the lock, return value is vpid of the top_waiter
+  *  - <0 - error
+  */
+-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+-                               struct futex_hash_bucket *hb1,
+-                               struct futex_hash_bucket *hb2,
+-                               union futex_key *key1, union futex_key *key2,
+-                               struct futex_pi_state **ps, int set_waiters)
++static int
++futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
++                         struct futex_hash_bucket *hb2, union futex_key *key1,
++                         union futex_key *key2, struct futex_pi_state **ps,
++                         struct task_struct **exiting, int set_waiters)
+ {
+       struct futex_q *top_waiter = NULL;
+       u32 curval;
+@@ -1914,7 +1972,7 @@ static int futex_proxy_trylock_atomic(u3
+        */
+       vpid = task_pid_vnr(top_waiter->task);
+       ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+-                                 set_waiters);
++                                 exiting, set_waiters);
+       if (ret == 1) {
+               requeue_pi_wake_futex(top_waiter, key2, hb2);
+               return vpid;
+@@ -2043,6 +2101,8 @@ retry_private:
+       }
+       if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
++              struct task_struct *exiting = NULL;
++
+               /*
+                * Attempt to acquire uaddr2 and wake the top waiter. If we
+                * intend to requeue waiters, force setting the FUTEX_WAITERS
+@@ -2050,7 +2110,8 @@ retry_private:
+                * faults rather in the requeue loop below.
+                */
+               ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+-                                               &key2, &pi_state, nr_requeue);
++                                               &key2, &pi_state,
++                                               &exiting, nr_requeue);
+               /*
+                * At this point the top_waiter has either taken uaddr2 or is
+@@ -2077,7 +2138,8 @@ retry_private:
+                        * If that call succeeds then we have pi_state and an
+                        * initial refcount on it.
+                        */
+-                      ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
++                      ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
++                                            &pi_state, &exiting);
+               }
+               switch (ret) {
+@@ -2107,6 +2169,12 @@ retry_private:
+                       hb_waiters_dec(hb2);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
++                      /*
++                       * Handle the case where the owner is in the middle of
++                       * exiting. Wait for the exit to complete otherwise
++                       * this task might loop forever, aka. live lock.
++                       */
++                      wait_for_owner_exiting(ret, exiting);
+                       cond_resched();
+                       goto retry;
+               default:
+@@ -2834,6 +2902,7 @@ static int futex_lock_pi(u32 __user *uad
+ {
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct futex_pi_state *pi_state = NULL;
++      struct task_struct *exiting = NULL;
+       struct rt_mutex_waiter rt_waiter;
+       struct futex_hash_bucket *hb;
+       struct futex_q q = futex_q_init;
+@@ -2861,7 +2930,8 @@ retry:
+ retry_private:
+       hb = queue_lock(&q);
+-      ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
++      ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
++                                 &exiting, 0);
+       if (unlikely(ret)) {
+               /*
+                * Atomic work succeeded and we got the lock,
+@@ -2884,6 +2954,12 @@ retry_private:
+                        */
+                       queue_unlock(hb);
+                       put_futex_key(&q.key);
++                      /*
++                       * Handle the case where the owner is in the middle of
++                       * exiting. Wait for the exit to complete otherwise
++                       * this task might loop forever, aka. live lock.
++                       */
++                      wait_for_owner_exiting(ret, exiting);
+                       cond_resched();
+                       goto retry;
+               default:
diff --git a/pending/futex-4.19/futex_Provide_distinct_return_value_when_owner_is_exiting.patch b/pending/futex-4.19/futex_Provide_distinct_return_value_when_owner_is_exiting.patch
new file mode 100644 (file)
index 0000000..6da55da
--- /dev/null
@@ -0,0 +1,81 @@
+Subject: futex: Provide distinct return value when owner is exiting
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:45 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream
+
+attach_to_pi_owner() returns -EAGAIN for various cases:
+
+ - Owner task is exiting
+ - Futex value has changed
+
+The caller drops the held locks (hash bucket, mmap_sem) and retries the
+operation. In case of the owner task exiting this can result in a live
+lock.
+
+As a preparatory step for seperating those cases, provide a distinct return
+value (EBUSY) for the owner exiting case.
+
+No functional change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.935606117@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1182,11 +1182,11 @@ static int handle_exit_race(u32 __user *
+       u32 uval2;
+       /*
+-       * If the futex exit state is not yet FUTEX_STATE_DEAD, wait
+-       * for it to finish.
++       * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
++       * caller that the alleged owner is busy.
+        */
+       if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+-              return -EAGAIN;
++              return -EBUSY;
+       /*
+        * Reread the user space value to handle the following situation:
+@@ -2095,12 +2095,13 @@ retry_private:
+                       if (!ret)
+                               goto retry;
+                       goto out;
++              case -EBUSY:
+               case -EAGAIN:
+                       /*
+                        * Two reasons for this:
+-                       * - Owner is exiting and we just wait for the
++                       * - EBUSY: Owner is exiting and we just wait for the
+                        *   exit to complete.
+-                       * - The user space value changed.
++                       * - EAGAIN: The user space value changed.
+                        */
+                       double_unlock_hb(hb1, hb2);
+                       hb_waiters_dec(hb2);
+@@ -2873,12 +2874,13 @@ retry_private:
+                       goto out_unlock_put_key;
+               case -EFAULT:
+                       goto uaddr_faulted;
++              case -EBUSY:
+               case -EAGAIN:
+                       /*
+                        * Two reasons for this:
+-                       * - Task is exiting and we just wait for the
++                       * - EBUSY: Task is exiting and we just wait for the
+                        *   exit to complete.
+-                       * - The user space value changed.
++                       * - EAGAIN: The user space value changed.
+                        */
+                       queue_unlock(hb);
+                       put_futex_key(&q.key);
diff --git a/pending/futex-4.19/futex_Provide_state_handling_for_exec()_as_well.patch b/pending/futex-4.19/futex_Provide_state_handling_for_exec()_as_well.patch
new file mode 100644 (file)
index 0000000..782f5a7
--- /dev/null
@@ -0,0 +1,97 @@
+Subject: futex: Provide state handling for exec() as well
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:43 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit af8cbda2cfcaa5515d61ec500498d46e9a8247e2 upstream
+
+exec() attempts to handle potentially held futexes gracefully by running
+the futex exit handling code like exit() does.
+
+The current implementation has no protection against concurrent incoming
+waiters. The reason is that the futex state cannot be set to
+FUTEX_STATE_DEAD after the cleanup because the task struct is still active
+and just about to execute the new binary.
+
+While its arguably buggy when a task holds a futex over exec(), for
+consistency sake the state handling can at least cover the actual futex
+exit cleanup section. This provides state consistency protection accross
+the cleanup. As the futex state of the task becomes FUTEX_STATE_OK after the
+cleanup has been finished, this cannot prevent subsequent attempts to
+attach to the task in case that the cleanup was not successfull in mopping
+up all leftovers.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.753355618@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   38 ++++++++++++++++++++++++++++++++++----
+ 1 file changed, 34 insertions(+), 4 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3698,7 +3698,7 @@ static void exit_robust_list(struct task
+       }
+ }
+-void futex_exec_release(struct task_struct *tsk)
++static void futex_cleanup(struct task_struct *tsk)
+ {
+       if (unlikely(tsk->robust_list)) {
+               exit_robust_list(tsk);
+@@ -3738,7 +3738,7 @@ void futex_exit_recursive(struct task_st
+       tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+-void futex_exit_release(struct task_struct *tsk)
++static void futex_cleanup_begin(struct task_struct *tsk)
+ {
+       /*
+        * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+@@ -3754,10 +3754,40 @@ void futex_exit_release(struct task_stru
+       raw_spin_lock_irq(&tsk->pi_lock);
+       tsk->futex_state = FUTEX_STATE_EXITING;
+       raw_spin_unlock_irq(&tsk->pi_lock);
++}
+-      futex_exec_release(tsk);
++static void futex_cleanup_end(struct task_struct *tsk, int state)
++{
++      /*
++       * Lockless store. The only side effect is that an observer might
++       * take another loop until it becomes visible.
++       */
++      tsk->futex_state = state;
++}
+-      tsk->futex_state = FUTEX_STATE_DEAD;
++void futex_exec_release(struct task_struct *tsk)
++{
++      /*
++       * The state handling is done for consistency, but in the case of
++       * exec() there is no way to prevent futher damage as the PID stays
++       * the same. But for the unlikely and arguably buggy case that a
++       * futex is held on exec(), this provides at least as much state
++       * consistency protection which is possible.
++       */
++      futex_cleanup_begin(tsk);
++      futex_cleanup(tsk);
++      /*
++       * Reset the state to FUTEX_STATE_OK. The task is alive and about
++       * exec a new binary.
++       */
++      futex_cleanup_end(tsk, FUTEX_STATE_OK);
++}
++
++void futex_exit_release(struct task_struct *tsk)
++{
++      futex_cleanup_begin(tsk);
++      futex_cleanup(tsk);
++      futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+ }
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
diff --git a/pending/futex-4.19/futex_Replace_PF_EXITPIDONE_with_a_state.patch b/pending/futex-4.19/futex_Replace_PF_EXITPIDONE_with_a_state.patch
new file mode 100644 (file)
index 0000000..11303d7
--- /dev/null
@@ -0,0 +1,192 @@
+Subject: futex: Replace PF_EXITPIDONE with a state
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:37 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 3d4775df0a89240f671861c6ab6e8d59af8e9e41 upstream
+
+The futex exit handling relies on PF_ flags. That's suboptimal as it
+requires a smp_mb() and an ugly lock/unlock of the exiting tasks pi_lock in
+the middle of do_exit() to enforce the observability of PF_EXITING in the
+futex code.
+
+Add a futex_state member to task_struct and convert the PF_EXITPIDONE logic
+over to the new state. The PF_EXITING dependency will be cleaned up in a
+later step.
+
+This prepares for handling various futex exit issues later.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.149449274@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h |   33 +++++++++++++++++++++++++++++++++
+ include/linux/sched.h |    2 +-
+ kernel/exit.c         |   18 ++----------------
+ kernel/futex.c        |   25 +++++++++++++------------
+ 4 files changed, 49 insertions(+), 29 deletions(-)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -53,6 +53,10 @@ union futex_key {
+ #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
+ #ifdef CONFIG_FUTEX
++enum {
++      FUTEX_STATE_OK,
++      FUTEX_STATE_DEAD,
++};
+ static inline void futex_init_task(struct task_struct *tsk)
+ {
+@@ -62,6 +66,34 @@ static inline void futex_init_task(struc
+ #endif
+       INIT_LIST_HEAD(&tsk->pi_state_list);
+       tsk->pi_state_cache = NULL;
++      tsk->futex_state = FUTEX_STATE_OK;
++}
++
++/**
++ * futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD
++ * @tsk:      task to set the state on
++ *
++ * Set the futex exit state of the task lockless. The futex waiter code
++ * observes that state when a task is exiting and loops until the task has
++ * actually finished the futex cleanup. The worst case for this is that the
++ * waiter runs through the wait loop until the state becomes visible.
++ *
++ * This has two callers:
++ *
++ * - futex_mm_release() after the futex exit cleanup has been done
++ *
++ * - do_exit() from the recursive fault handling path.
++ *
++ * In case of a recursive fault this is best effort. Either the futex exit
++ * code has run already or not. If the OWNER_DIED bit has been set on the
++ * futex then the waiter can take it over. If not, the problem is pushed
++ * back to user space. If the futex exit code did not run yet, then an
++ * already queued waiter might block forever, but there is nothing which
++ * can be done about that.
++ */
++static inline void futex_exit_done(struct task_struct *tsk)
++{
++      tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+ void futex_mm_release(struct task_struct *tsk);
+@@ -71,6 +103,7 @@ long do_futex(u32 __user *uaddr, int op,
+ #else
+ static inline void futex_init_task(struct task_struct *tsk) { }
+ static inline void futex_mm_release(struct task_struct *tsk) { }
++static inline void futex_exit_done(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+                           ktime_t *timeout, u32 __user *uaddr2,
+                           u32 val2, u32 val3)
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -996,6 +996,7 @@ struct task_struct {
+ #endif
+       struct list_head                pi_state_list;
+       struct futex_pi_state           *pi_state_cache;
++      unsigned int                    futex_state;
+ #endif
+ #ifdef CONFIG_PERF_EVENTS
+       struct perf_event_context       *perf_event_ctxp[perf_nr_task_contexts];
+@@ -1377,7 +1378,6 @@ extern struct pid *cad_pid;
+  */
+ #define PF_IDLE                       0x00000002      /* I am an IDLE thread */
+ #define PF_EXITING            0x00000004      /* Getting shut down */
+-#define PF_EXITPIDONE         0x00000008      /* PI exit done on shut down */
+ #define PF_VCPU                       0x00000010      /* I'm a virtual CPU */
+ #define PF_WQ_WORKER          0x00000020      /* I'm a workqueue worker */
+ #define PF_FORKNOEXEC         0x00000040      /* Forked but didn't exec */
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -818,16 +818,7 @@ void __noreturn do_exit(long code)
+        */
+       if (unlikely(tsk->flags & PF_EXITING)) {
+               pr_alert("Fixing recursive fault but reboot is needed!\n");
+-              /*
+-               * We can do this unlocked here. The futex code uses
+-               * this flag just to verify whether the pi state
+-               * cleanup has been done or not. In the worst case it
+-               * loops once more. We pretend that the cleanup was
+-               * done as there is no way to return. Either the
+-               * OWNER_DIED bit is set by now or we push the blocked
+-               * task into the wait for ever nirwana as well.
+-               */
+-              tsk->flags |= PF_EXITPIDONE;
++              futex_exit_done(tsk);
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               schedule();
+       }
+@@ -918,12 +909,7 @@ void __noreturn do_exit(long code)
+        * Make sure we are holding no locks:
+        */
+       debug_check_no_locks_held();
+-      /*
+-       * We can do this unlocked here. The futex code uses this flag
+-       * just to verify whether the pi state cleanup has been done
+-       * or not. In the worst case it loops once more.
+-       */
+-      tsk->flags |= PF_EXITPIDONE;
++      futex_exit_done(tsk);
+       if (tsk->io_context)
+               exit_io_context(tsk);
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user *
+       u32 uval2;
+       /*
+-       * If PF_EXITPIDONE is not yet set, then try again.
++       * If the futex exit state is not yet FUTEX_STATE_DEAD, wait
++       * for it to finish.
+        */
+-      if (tsk && !(tsk->flags & PF_EXITPIDONE))
++      if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+               return -EAGAIN;
+       /*
+@@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user *
+        *    *uaddr = 0xC0000000;           tsk = get_task(PID);
+        *   }                               if (!tsk->flags & PF_EXITING) {
+        *  ...                                attach();
+-       *  tsk->flags |= PF_EXITPIDONE;     } else {
+-       *                                     if (!(tsk->flags & PF_EXITPIDONE))
++       *  tsk->futex_state =               } else {
++       *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
++       *                                        FUTEX_STATE_DEAD)
+        *                                       return -EAGAIN;
+        *                                     return -ESRCH; <--- FAIL
+        *                                   }
+@@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user
+       }
+       /*
+-       * We need to look at the task state flags to figure out,
+-       * whether the task is exiting. To protect against the do_exit
+-       * change of the task flags, we do this protected by
+-       * p->pi_lock:
++       * We need to look at the task state to figure out, whether the
++       * task is exiting. To protect against the change of the task state
++       * in futex_exit_release(), we do this protected by p->pi_lock:
+        */
+       raw_spin_lock_irq(&p->pi_lock);
+-      if (unlikely(p->flags & PF_EXITING)) {
++      if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
+               /*
+-               * The task is on the way out. When PF_EXITPIDONE is
+-               * set, we know that the task has finished the
+-               * cleanup:
++               * The task is on the way out. When the futex state is
++               * FUTEX_STATE_DEAD, we know that the task has finished
++               * the cleanup:
+                */
+               int ret = handle_exit_race(uaddr, uval, p);
diff --git a/pending/futex-4.19/futex_Sanitize_exit_state_handling.patch b/pending/futex-4.19/futex_Sanitize_exit_state_handling.patch
new file mode 100644 (file)
index 0000000..ea0c362
--- /dev/null
@@ -0,0 +1,50 @@
+Subject: futex: Sanitize exit state handling
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:42 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 4a8e991b91aca9e20705d434677ac013974e0e30 upstream
+
+Instead of having a smp_mb() and an empty lock/unlock of task::pi_lock move
+the state setting into to the lock section.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.645603214@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   17 ++++++++++-------
+ 1 file changed, 10 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3740,16 +3740,19 @@ void futex_exit_recursive(struct task_st
+ void futex_exit_release(struct task_struct *tsk)
+ {
+-      tsk->futex_state = FUTEX_STATE_EXITING;
+-      /*
+-       * Ensure that all new tsk->pi_lock acquisitions must observe
+-       * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner().
+-       */
+-      smp_mb();
+       /*
+-       * Ensure that we must observe the pi_state in exit_pi_state_list().
++       * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
++       *
++       * This ensures that all subsequent checks of tsk->futex_state in
++       * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
++       * tsk->pi_lock held.
++       *
++       * It guarantees also that a pi_state which was queued right before
++       * the state change under tsk->pi_lock by a concurrent waiter must
++       * be observed in exit_pi_state_list().
+        */
+       raw_spin_lock_irq(&tsk->pi_lock);
++      tsk->futex_state = FUTEX_STATE_EXITING;
+       raw_spin_unlock_irq(&tsk->pi_lock);
+       futex_exec_release(tsk);
diff --git a/pending/futex-4.19/futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch b/pending/futex-4.19/futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch
new file mode 100644 (file)
index 0000000..8ea9cbb
--- /dev/null
@@ -0,0 +1,46 @@
+Subject: futex: Set task::futex_state to DEAD right after handling futex exit
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:40 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit f24f22435dcc11389acc87e5586239c1819d217c upstream
+
+Setting task::futex_state in do_exit() is rather arbitrarily placed for no
+reason. Move it into the futex code.
+
+Note, this is only done for the exit cleanup as the exec cleanup cannot set
+the state to FUTEX_STATE_DEAD because the task struct is still in active
+use.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.439511191@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/exit.c  |    1 -
+ kernel/futex.c |    1 +
+ 2 files changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -909,7 +909,6 @@ void __noreturn do_exit(long code)
+        * Make sure we are holding no locks:
+        */
+       debug_check_no_locks_held();
+-      futex_exit_done(tsk);
+       if (tsk->io_context)
+               exit_io_context(tsk);
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3719,6 +3719,7 @@ void futex_exec_release(struct task_stru
+ void futex_exit_release(struct task_struct *tsk)
+ {
+       futex_exec_release(tsk);
++      futex_exit_done(tsk);
+ }
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
diff --git a/pending/futex-4.19/futex_Split_futex_mm_release()_for_exitexec.patch b/pending/futex-4.19/futex_Split_futex_mm_release()_for_exitexec.patch
new file mode 100644 (file)
index 0000000..ef555df
--- /dev/null
@@ -0,0 +1,96 @@
+Subject: futex: Split futex_mm_release() for exit/exec
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed Nov  6 22:55:39 2019 +0100
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 150d71584b12809144b8145b817e83b81158ae5f upstream
+
+To allow separate handling of the futex exit state in the futex exit code
+for exit and exec, split futex_mm_release() into two functions and invoke
+them from the corresponding exit/exec_mm_release() callsites.
+
+Preparatory only, no functional change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20191106224556.332094221@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/futex.h |    6 ++++--
+ kernel/fork.c         |    5 ++---
+ kernel/futex.c        |    7 ++++++-
+ 3 files changed, 12 insertions(+), 6 deletions(-)
+
+--- a/include/linux/futex.h
++++ b/include/linux/futex.h
+@@ -96,14 +96,16 @@ static inline void futex_exit_done(struc
+       tsk->futex_state = FUTEX_STATE_DEAD;
+ }
+-void futex_mm_release(struct task_struct *tsk);
++void futex_exit_release(struct task_struct *tsk);
++void futex_exec_release(struct task_struct *tsk);
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+             u32 __user *uaddr2, u32 val2, u32 val3);
+ #else
+ static inline void futex_init_task(struct task_struct *tsk) { }
+-static inline void futex_mm_release(struct task_struct *tsk) { }
+ static inline void futex_exit_done(struct task_struct *tsk) { }
++static inline void futex_exit_release(struct task_struct *tsk) { }
++static inline void futex_exec_release(struct task_struct *tsk) { }
+ static inline long do_futex(u32 __user *uaddr, int op, u32 val,
+                           ktime_t *timeout, u32 __user *uaddr2,
+                           u32 val2, u32 val3)
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1219,9 +1219,6 @@ static int wait_for_vfork_done(struct ta
+  */
+ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
+-      /* Get rid of any futexes when releasing the mm */
+-      futex_mm_release(tsk);
+-
+       uprobe_free_utask(tsk);
+       /* Get rid of any cached register state */
+@@ -1256,11 +1253,13 @@ static void mm_release(struct task_struc
+ void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
++      futex_exit_release(tsk);
+       mm_release(tsk, mm);
+ }
+ void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+ {
++      futex_exec_release(tsk);
+       mm_release(tsk, mm);
+ }
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3698,7 +3698,7 @@ static void exit_robust_list(struct task
+       }
+ }
+-void futex_mm_release(struct task_struct *tsk)
++void futex_exec_release(struct task_struct *tsk)
+ {
+       if (unlikely(tsk->robust_list)) {
+               exit_robust_list(tsk);
+@@ -3716,6 +3716,11 @@ void futex_mm_release(struct task_struct
+               exit_pi_state_list(tsk);
+ }
++void futex_exit_release(struct task_struct *tsk)
++{
++      futex_exec_release(tsk);
++}
++
+ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
+               u32 __user *uaddr2, u32 val2, u32 val3)
+ {
diff --git a/pending/futex-4.19/series b/pending/futex-4.19/series
new file mode 100644 (file)
index 0000000..df259c9
--- /dev/null
@@ -0,0 +1,11 @@
+futex_Move_futex_exit_handling_into_futex_code.patch
+futex_Replace_PF_EXITPIDONE_with_a_state.patch
+exitexec_Seperate_mm_release().patch
+futex_Split_futex_mm_release()_for_exitexec.patch
+futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch
+futex_Mark_the_begin_of_futex_exit_explicitly.patch
+futex_Sanitize_exit_state_handling.patch
+futex_Provide_state_handling_for_exec()_as_well.patch
+futex_Add_mutex_around_futex_exit.patch
+futex_Provide_distinct_return_value_when_owner_is_exiting.patch
+futex_Prevent_exit_livelock.patch