From: Greg Kroah-Hartman Date: Tue, 26 Jan 2021 10:29:34 +0000 (+0100) Subject: futex patches stashed for next 4.19 release. X-Git-Tag: v4.19.171~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2332d09b70f80e17be97dca89fe992ef73a0a1b8;p=thirdparty%2Fkernel%2Fstable-queue.git futex patches stashed for next 4.19 release. --- diff --git a/pending/futex-4.19/exitexec_Seperate_mm_release().patch b/pending/futex-4.19/exitexec_Seperate_mm_release().patch new file mode 100644 index 00000000000..6f1eca6d78a --- /dev/null +++ b/pending/futex-4.19/exitexec_Seperate_mm_release().patch @@ -0,0 +1,97 @@ +Subject: exit/exec: Seperate mm_release() +From: Thomas Gleixner +Date: Wed Nov 6 22:55:38 2019 +0100 + +From: Thomas Gleixner + +commit 4610ba7ad877fafc0a25a30c6c82015304120426 upstream + +mm_release() contains the futex exit handling. mm_release() is called from +do_exit()->exit_mm() and from exec()->exec_mm(). + +In the exit_mm() case PF_EXITING and the futex state is updated. In the +exec_mm() case these states are not touched. + +As the futex exit code needs further protections against exit races, this +needs to be split into two functions. + +Preparatory only, no functional change. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.240518241@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + fs/exec.c | 2 +- + include/linux/sched/mm.h | 6 ++++-- + kernel/exit.c | 2 +- + kernel/fork.c | 12 +++++++++++- + 4 files changed, 17 insertions(+), 5 deletions(-) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1011,7 +1011,7 @@ static int exec_mmap(struct mm_struct *m + /* Notify parent that we're no longer interested in the old VM */ + tsk = current; + old_mm = current->mm; +- mm_release(tsk, old_mm); ++ exec_mm_release(tsk, old_mm); + + if (old_mm) { + sync_mm_rss(old_mm); +--- a/include/linux/sched/mm.h ++++ b/include/linux/sched/mm.h +@@ -119,8 +119,10 @@ extern struct mm_struct *get_task_mm(str + * succeeds. + */ + extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); +-/* Remove the current tasks stale references to the old mm_struct */ +-extern void mm_release(struct task_struct *, struct mm_struct *); ++/* Remove the current tasks stale references to the old mm_struct on exit() */ ++extern void exit_mm_release(struct task_struct *, struct mm_struct *); ++/* Remove the current tasks stale references to the old mm_struct on exec() */ ++extern void exec_mm_release(struct task_struct *, struct mm_struct *); + + #ifdef CONFIG_MEMCG + extern void mm_update_next_owner(struct mm_struct *mm); +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -498,7 +498,7 @@ static void exit_mm(void) + struct mm_struct *mm = current->mm; + struct core_state *core_state; + +- mm_release(current, mm); ++ exit_mm_release(current, mm); + if (!mm) + return; + sync_mm_rss(mm); +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1217,7 +1217,7 @@ static int wait_for_vfork_done(struct ta + * restoring the old one. . . 
+ * Eric Biederman 10 January 1998 + */ +-void mm_release(struct task_struct *tsk, struct mm_struct *mm) ++static void mm_release(struct task_struct *tsk, struct mm_struct *mm) + { + /* Get rid of any futexes when releasing the mm */ + futex_mm_release(tsk); +@@ -1254,6 +1254,16 @@ void mm_release(struct task_struct *tsk, + complete_vfork_done(tsk); + } + ++void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) ++{ ++ mm_release(tsk, mm); ++} ++ ++void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) ++{ ++ mm_release(tsk, mm); ++} ++ + /* + * Allocate a new mm structure and copy contents from the + * mm structure of the passed in task structure. diff --git a/pending/futex-4.19/futex_Add_mutex_around_futex_exit.patch b/pending/futex-4.19/futex_Add_mutex_around_futex_exit.patch new file mode 100644 index 00000000000..ef9d6d5ac26 --- /dev/null +++ b/pending/futex-4.19/futex_Add_mutex_around_futex_exit.patch @@ -0,0 +1,82 @@ +Subject: futex: Add mutex around futex exit +From: Thomas Gleixner +Date: Wed Nov 6 22:55:44 2019 +0100 + +From: Thomas Gleixner + +commit 3f186d974826847a07bc7964d79ec4eded475ad9 upstream + +The mutex will be used in subsequent changes to replace the busy looping of +a waiter when the futex owner is currently executing the exit cleanup to +prevent a potential live lock. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.845798895@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/futex.h | 1 + + include/linux/sched.h | 1 + + kernel/futex.c | 16 ++++++++++++++++ + 3 files changed, 18 insertions(+) + +--- a/include/linux/futex.h ++++ b/include/linux/futex.h +@@ -68,6 +68,7 @@ static inline void futex_init_task(struc + INIT_LIST_HEAD(&tsk->pi_state_list); + tsk->pi_state_cache = NULL; + tsk->futex_state = FUTEX_STATE_OK; ++ mutex_init(&tsk->futex_exit_mutex); + } + + void futex_exit_recursive(struct task_struct *tsk); +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -996,6 +996,7 @@ struct task_struct { + #endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; ++ struct mutex futex_exit_mutex; + unsigned int futex_state; + #endif + #ifdef CONFIG_PERF_EVENTS +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -3735,12 +3735,23 @@ static void futex_cleanup(struct task_st + */ + void futex_exit_recursive(struct task_struct *tsk) + { ++ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ ++ if (tsk->futex_state == FUTEX_STATE_EXITING) ++ mutex_unlock(&tsk->futex_exit_mutex); + tsk->futex_state = FUTEX_STATE_DEAD; + } + + static void futex_cleanup_begin(struct task_struct *tsk) + { + /* ++ * Prevent various race issues against a concurrent incoming waiter ++ * including live locks by forcing the waiter to block on ++ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in ++ * attach_to_pi_owner(). ++ */ ++ mutex_lock(&tsk->futex_exit_mutex); ++ ++ /* + * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. + * + * This ensures that all subsequent checks of tsk->futex_state in +@@ -3763,6 +3774,11 @@ static void futex_cleanup_end(struct tas + * take another loop until it becomes visible. 
+ */ + tsk->futex_state = state; ++ /* ++ * Drop the exit protection. This unblocks waiters which observed ++ * FUTEX_STATE_EXITING to reevaluate the state. ++ */ ++ mutex_unlock(&tsk->futex_exit_mutex); + } + + void futex_exec_release(struct task_struct *tsk) diff --git a/pending/futex-4.19/futex_Mark_the_begin_of_futex_exit_explicitly.patch b/pending/futex-4.19/futex_Mark_the_begin_of_futex_exit_explicitly.patch new file mode 100644 index 00000000000..973da35c0ca --- /dev/null +++ b/pending/futex-4.19/futex_Mark_the_begin_of_futex_exit_explicitly.patch @@ -0,0 +1,159 @@ +Subject: futex: Mark the begin of futex exit explicitly +From: Thomas Gleixner +Date: Wed Nov 6 22:55:41 2019 +0100 + +From: Thomas Gleixner + +commit 18f694385c4fd77a09851fd301236746ca83f3cb upstream + +Instead of relying on PF_EXITING use an explicit state for the futex exit +and set it in the futex exit function. This moves the smp barrier and the +lock/unlock serialization into the futex code. + +As with the DEAD state this is restricted to the exit path as exec +continues to use the same task struct. + +This allows to simplify that logic in a next step. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.539409004@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/futex.h | 31 +++---------------------------- + kernel/exit.c | 13 +------------ + kernel/futex.c | 37 ++++++++++++++++++++++++++++++++++++- + 3 files changed, 40 insertions(+), 41 deletions(-) + +--- a/include/linux/futex.h ++++ b/include/linux/futex.h +@@ -55,6 +55,7 @@ union futex_key { + #ifdef CONFIG_FUTEX + enum { + FUTEX_STATE_OK, ++ FUTEX_STATE_EXITING, + FUTEX_STATE_DEAD, + }; + +@@ -69,33 +70,7 @@ static inline void futex_init_task(struc + tsk->futex_state = FUTEX_STATE_OK; + } + +-/** +- * futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD +- * @tsk: task to set the state on +- * +- * Set the futex exit state of the task lockless. The futex waiter code +- * observes that state when a task is exiting and loops until the task has +- * actually finished the futex cleanup. The worst case for this is that the +- * waiter runs through the wait loop until the state becomes visible. +- * +- * This has two callers: +- * +- * - futex_mm_release() after the futex exit cleanup has been done +- * +- * - do_exit() from the recursive fault handling path. +- * +- * In case of a recursive fault this is best effort. Either the futex exit +- * code has run already or not. If the OWNER_DIED bit has been set on the +- * futex then the waiter can take it over. If not, the problem is pushed +- * back to user space. If the futex exit code did not run yet, then an +- * already queued waiter might block forever, but there is nothing which +- * can be done about that. 
+- */ +-static inline void futex_exit_done(struct task_struct *tsk) +-{ +- tsk->futex_state = FUTEX_STATE_DEAD; +-} +- ++void futex_exit_recursive(struct task_struct *tsk); + void futex_exit_release(struct task_struct *tsk); + void futex_exec_release(struct task_struct *tsk); + +@@ -103,7 +78,7 @@ long do_futex(u32 __user *uaddr, int op, + u32 __user *uaddr2, u32 val2, u32 val3); + #else + static inline void futex_init_task(struct task_struct *tsk) { } +-static inline void futex_exit_done(struct task_struct *tsk) { } ++static inline void futex_exit_recursive(struct task_struct *tsk) { } + static inline void futex_exit_release(struct task_struct *tsk) { } + static inline void futex_exec_release(struct task_struct *tsk) { } + static inline long do_futex(u32 __user *uaddr, int op, u32 val, +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -818,23 +818,12 @@ void __noreturn do_exit(long code) + */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); +- futex_exit_done(tsk); ++ futex_exit_recursive(tsk); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } + + exit_signals(tsk); /* sets PF_EXITING */ +- /* +- * Ensure that all new tsk->pi_lock acquisitions must observe +- * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). +- */ +- smp_mb(); +- /* +- * Ensure that we must observe the pi_state in exit_mm() -> +- * mm_release() -> exit_pi_state_list(). +- */ +- raw_spin_lock_irq(&tsk->pi_lock); +- raw_spin_unlock_irq(&tsk->pi_lock); + + /* sync mm's RSS info before statistics gathering */ + if (tsk->mm) +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -3716,10 +3716,45 @@ void futex_exec_release(struct task_stru + exit_pi_state_list(tsk); + } + ++/** ++ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD ++ * @tsk: task to set the state on ++ * ++ * Set the futex exit state of the task lockless. 
The futex waiter code ++ * observes that state when a task is exiting and loops until the task has ++ * actually finished the futex cleanup. The worst case for this is that the ++ * waiter runs through the wait loop until the state becomes visible. ++ * ++ * This is called from the recursive fault handling path in do_exit(). ++ * ++ * This is best effort. Either the futex exit code has run already or ++ * not. If the OWNER_DIED bit has been set on the futex then the waiter can ++ * take it over. If not, the problem is pushed back to user space. If the ++ * futex exit code did not run yet, then an already queued waiter might ++ * block forever, but there is nothing which can be done about that. ++ */ ++void futex_exit_recursive(struct task_struct *tsk) ++{ ++ tsk->futex_state = FUTEX_STATE_DEAD; ++} ++ + void futex_exit_release(struct task_struct *tsk) + { ++ tsk->futex_state = FUTEX_STATE_EXITING; ++ /* ++ * Ensure that all new tsk->pi_lock acquisitions must observe ++ * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner(). ++ */ ++ smp_mb(); ++ /* ++ * Ensure that we must observe the pi_state in exit_pi_state_list(). 
++ */ ++ raw_spin_lock_irq(&tsk->pi_lock); ++ raw_spin_unlock_irq(&tsk->pi_lock); ++ + futex_exec_release(tsk); +- futex_exit_done(tsk); ++ ++ tsk->futex_state = FUTEX_STATE_DEAD; + } + + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, diff --git a/pending/futex-4.19/futex_Move_futex_exit_handling_into_futex_code.patch b/pending/futex-4.19/futex_Move_futex_exit_handling_into_futex_code.patch new file mode 100644 index 00000000000..d08d9ed4a18 --- /dev/null +++ b/pending/futex-4.19/futex_Move_futex_exit_handling_into_futex_code.patch @@ -0,0 +1,216 @@ +Subject: futex: Move futex exit handling into futex code +From: Thomas Gleixner +Date: Wed Nov 6 22:55:36 2019 +0100 + +From: Thomas Gleixner + +commit ba31c1a48538992316cc71ce94fa9cd3e7b427c0 upstream + +The futex exit handling is #ifdeffed into mm_release() which is not pretty +to begin with. But upcoming changes to address futex exit races need to add +more functionality to this exit code. + +Split it out into a function, move it into futex code and make the various +futex exit functions static. + +Preparatory only and no functional change. + +Folded build fix from Borislav. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.049705556@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/compat.h | 2 -- + include/linux/futex.h | 29 ++++++++++++++++------------- + kernel/fork.c | 25 +++---------------------- + kernel/futex.c | 33 +++++++++++++++++++++++++++++---- + 4 files changed, 48 insertions(+), 41 deletions(-) + +--- a/include/linux/compat.h ++++ b/include/linux/compat.h +@@ -445,8 +445,6 @@ struct compat_kexec_segment; + struct compat_mq_attr; + struct compat_msgbuf; + +-extern void compat_exit_robust_list(struct task_struct *curr); +- + #define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) + + #define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) +--- a/include/linux/futex.h ++++ b/include/linux/futex.h +@@ -2,7 +2,9 @@ + #ifndef _LINUX_FUTEX_H + #define _LINUX_FUTEX_H + ++#include <linux/sched.h> + #include <linux/ktime.h> ++ + #include <uapi/linux/futex.h> + + struct inode; +@@ -51,15 +53,24 @@ union futex_key { + #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } } + + #ifdef CONFIG_FUTEX +-extern void exit_robust_list(struct task_struct *curr); + +-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, +- u32 __user *uaddr2, u32 val2, u32 val3); +-#else +-static inline void exit_robust_list(struct task_struct *curr) ++static inline void futex_init_task(struct task_struct *tsk) + { ++ tsk->robust_list = NULL; ++#ifdef CONFIG_COMPAT ++ tsk->compat_robust_list = NULL; ++#endif ++ INIT_LIST_HEAD(&tsk->pi_state_list); ++ tsk->pi_state_cache = NULL; + } + ++void futex_mm_release(struct task_struct *tsk); ++ ++long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ++ u32 __user *uaddr2, u32 val2, u32 val3); ++#else ++static inline void futex_init_task(struct task_struct *tsk) { } ++static inline void futex_mm_release(struct task_struct *tsk) { } + static inline long do_futex(u32 
__user *uaddr, int op, u32 val, + ktime_t *timeout, u32 __user *uaddr2, + u32 val2, u32 val3) +@@ -68,12 +79,4 @@ static inline long do_futex(u32 __user * + } + #endif + +-#ifdef CONFIG_FUTEX_PI +-extern void exit_pi_state_list(struct task_struct *curr); +-#else +-static inline void exit_pi_state_list(struct task_struct *curr) +-{ +-} +-#endif +- + #endif +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1220,20 +1220,7 @@ static int wait_for_vfork_done(struct ta + void mm_release(struct task_struct *tsk, struct mm_struct *mm) + { + /* Get rid of any futexes when releasing the mm */ +-#ifdef CONFIG_FUTEX +- if (unlikely(tsk->robust_list)) { +- exit_robust_list(tsk); +- tsk->robust_list = NULL; +- } +-#ifdef CONFIG_COMPAT +- if (unlikely(tsk->compat_robust_list)) { +- compat_exit_robust_list(tsk); +- tsk->compat_robust_list = NULL; +- } +-#endif +- if (unlikely(!list_empty(&tsk->pi_state_list))) +- exit_pi_state_list(tsk); +-#endif ++ futex_mm_release(tsk); + + uprobe_free_utask(tsk); + +@@ -1937,14 +1924,8 @@ static __latent_entropy struct task_stru + #ifdef CONFIG_BLOCK + p->plug = NULL; + #endif +-#ifdef CONFIG_FUTEX +- p->robust_list = NULL; +-#ifdef CONFIG_COMPAT +- p->compat_robust_list = NULL; +-#endif +- INIT_LIST_HEAD(&p->pi_state_list); +- p->pi_state_cache = NULL; +-#endif ++ futex_init_task(p); ++ + /* + * sigaltstack should be cleared when sharing the same VM + */ +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -341,6 +341,12 @@ static inline bool should_fail_futex(boo + } + #endif /* CONFIG_FAIL_FUTEX */ + ++#ifdef CONFIG_COMPAT ++static void compat_exit_robust_list(struct task_struct *curr); ++#else ++static inline void compat_exit_robust_list(struct task_struct *curr) { } ++#endif ++ + static inline void futex_get_mm(union futex_key *key) + { + mmgrab(key->private.mm); +@@ -890,7 +896,7 @@ static void put_pi_state(struct futex_pi + * Kernel cleans up PI-state, but userspace is likely hosed. 
+ * (Robust-futex cleanup is separate and might save the day for userspace.) + */ +-void exit_pi_state_list(struct task_struct *curr) ++static void exit_pi_state_list(struct task_struct *curr) + { + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; +@@ -960,7 +966,8 @@ void exit_pi_state_list(struct task_stru + } + raw_spin_unlock_irq(&curr->pi_lock); + } +- ++#else ++static inline void exit_pi_state_list(struct task_struct *curr) { } + #endif + + /* +@@ -3625,7 +3632,7 @@ static inline int fetch_robust_entry(str + * + * We silently return on any sign of list-walking problem. + */ +-void exit_robust_list(struct task_struct *curr) ++static void exit_robust_list(struct task_struct *curr) + { + struct robust_list_head __user *head = curr->robust_list; + struct robust_list __user *entry, *next_entry, *pending; +@@ -3690,6 +3697,24 @@ void exit_robust_list(struct task_struct + } + } + ++void futex_mm_release(struct task_struct *tsk) ++{ ++ if (unlikely(tsk->robust_list)) { ++ exit_robust_list(tsk); ++ tsk->robust_list = NULL; ++ } ++ ++#ifdef CONFIG_COMPAT ++ if (unlikely(tsk->compat_robust_list)) { ++ compat_exit_robust_list(tsk); ++ tsk->compat_robust_list = NULL; ++ } ++#endif ++ ++ if (unlikely(!list_empty(&tsk->pi_state_list))) ++ exit_pi_state_list(tsk); ++} ++ + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3) + { +@@ -3817,7 +3842,7 @@ static void __user *futex_uaddr(struct r + * + * We silently return on any sign of list-walking problem. 
+ */ +-void compat_exit_robust_list(struct task_struct *curr) ++static void compat_exit_robust_list(struct task_struct *curr) + { + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *next_entry, *pending; diff --git a/pending/futex-4.19/futex_Prevent_exit_livelock.patch b/pending/futex-4.19/futex_Prevent_exit_livelock.patch new file mode 100644 index 00000000000..99b8d8c8fe7 --- /dev/null +++ b/pending/futex-4.19/futex_Prevent_exit_livelock.patch @@ -0,0 +1,342 @@ +Subject: futex: Prevent exit livelock +From: Thomas Gleixner +Date: Wed Nov 6 22:55:46 2019 +0100 + +From: Thomas Gleixner + +commit 3ef240eaff36b8119ac9e2ea17cbf41179c930ba upstream + +Oleg provided the following test case: + +int main(void) +{ + struct sched_param sp = {}; + + sp.sched_priority = 2; + assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0); + + int lock = vfork(); + if (!lock) { + sp.sched_priority = 1; + assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0); + _exit(0); + } + + syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0); + return 0; +} + +This creates an unkillable RT process spinning in futex_lock_pi() on a UP +machine or if the process is affine to a single CPU. The reason is: + + parent child + + set FIFO prio 2 + + vfork() -> set FIFO prio 1 + implies wait_for_child() sched_setscheduler(...) + exit() + do_exit() + .... + mm_release() + tsk->futex_state = FUTEX_STATE_EXITING; + exit_futex(); (NOOP in this case) + complete() --> wakes parent + sys_futex() + loop infinite because + tsk->futex_state == FUTEX_STATE_EXITING + +The same problem can happen just by regular preemption as well: + + task holds futex + ... + do_exit() + tsk->futex_state = FUTEX_STATE_EXITING; + + --> preemption (unrelated wakeup of some other higher prio task, e.g. 
timer) + + switch_to(other_task) + + return to user + sys_futex() + loop infinite as above + +Just for the fun of it the futex exit cleanup could trigger the wakeup +itself before the task sets its futex state to DEAD. + +To cure this, the handling of the exiting owner is changed so: + + - A refcount is held on the task + + - The task pointer is stored in a caller visible location + + - The caller drops all locks (hash bucket, mmap_sem) and blocks + on task::futex_exit_mutex. When the mutex is acquired then + the exiting task has completed the cleanup and the state + is consistent and can be reevaluated. + +This is not a pretty solution, but there is no choice other than returning +an error code to user space, which would break the state consistency +guarantee and open another can of problems including regressions. + +For stable backports the preparatory commits ac31c7ff8624 .. ba31c1a48538 +are required as well, but for anything older than 5.3.y the backports are +going to be provided when this hits mainline as the other dependencies for +those kernels are definitely not stable material. + +Fixes: 778e9a9c3e71 ("pi-futex: fix exit races and locking problems") +Reported-by: Oleg Nesterov +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Cc: Stable Team +Link: https://lkml.kernel.org/r/20191106224557.041676471@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 91 insertions(+), 15 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1176,6 +1176,36 @@ out_error: + return ret; + } + ++/** ++ * wait_for_owner_exiting - Block until the owner has exited ++ * @exiting: Pointer to the exiting task ++ * ++ * Caller must hold a refcount on @exiting. 
++ */ ++static void wait_for_owner_exiting(int ret, struct task_struct *exiting) ++{ ++ if (ret != -EBUSY) { ++ WARN_ON_ONCE(exiting); ++ return; ++ } ++ ++ if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) ++ return; ++ ++ mutex_lock(&exiting->futex_exit_mutex); ++ /* ++ * No point in doing state checking here. If the waiter got here ++ * while the task was in exec()->exec_futex_release() then it can ++ * have any FUTEX_STATE_* value when the waiter has acquired the ++ * mutex. OK, if running, EXITING or DEAD if it reached exit() ++ * already. Highly unlikely and not a problem. Just one more round ++ * through the futex maze. ++ */ ++ mutex_unlock(&exiting->futex_exit_mutex); ++ ++ put_task_struct(exiting); ++} ++ + static int handle_exit_race(u32 __user *uaddr, u32 uval, + struct task_struct *tsk) + { +@@ -1237,7 +1267,8 @@ static int handle_exit_race(u32 __user * + * it after doing proper sanity checks. + */ + static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, +- struct futex_pi_state **ps) ++ struct futex_pi_state **ps, ++ struct task_struct **exiting) + { + pid_t pid = uval & FUTEX_TID_MASK; + struct futex_pi_state *pi_state; +@@ -1276,7 +1307,19 @@ static int attach_to_pi_owner(u32 __user + int ret = handle_exit_race(uaddr, uval, p); + + raw_spin_unlock_irq(&p->pi_lock); +- put_task_struct(p); ++ /* ++ * If the owner task is between FUTEX_STATE_EXITING and ++ * FUTEX_STATE_DEAD then store the task pointer and keep ++ * the reference on the task struct. The calling code will ++ * drop all locks, wait for the task to reach ++ * FUTEX_STATE_DEAD and then drop the refcount. This is ++ * required to prevent a live lock when the current task ++ * preempted the exiting task between the two states. 
++ */ ++ if (ret == -EBUSY) ++ *exiting = p; ++ else ++ put_task_struct(p); + return ret; + } + +@@ -1315,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user + + static int lookup_pi_state(u32 __user *uaddr, u32 uval, + struct futex_hash_bucket *hb, +- union futex_key *key, struct futex_pi_state **ps) ++ union futex_key *key, struct futex_pi_state **ps, ++ struct task_struct **exiting) + { + struct futex_q *top_waiter = futex_top_waiter(hb, key); + +@@ -1330,7 +1374,7 @@ static int lookup_pi_state(u32 __user *u + * We are the first waiter - try to look up the owner based on + * @uval and attach to it. + */ +- return attach_to_pi_owner(uaddr, uval, key, ps); ++ return attach_to_pi_owner(uaddr, uval, key, ps, exiting); + } + + static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +@@ -1358,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __u + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. ++ * @exiting: Pointer to store the task pointer of the owner task ++ * which is in the middle of exiting + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Return: +@@ -1366,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __u + * - <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. ++ * ++ * @exiting is only set when the return value is -EBUSY. If so, this holds ++ * a refcount on the exiting task on return and the caller needs to drop it ++ * after waiting for the exit to complete. + */ + static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, +- struct task_struct *task, int set_waiters) ++ struct task_struct *task, ++ struct task_struct **exiting, ++ int set_waiters) + { + u32 uval, newval, vpid = task_pid_vnr(task); + struct futex_q *top_waiter; +@@ -1440,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __us + * attach to the owner. 
If that fails, no harm done, we only + * set the FUTEX_WAITERS bit in the user space variable. + */ +- return attach_to_pi_owner(uaddr, newval, key, ps); ++ return attach_to_pi_owner(uaddr, newval, key, ps, exiting); + } + + /** +@@ -1861,6 +1913,8 @@ void requeue_pi_wake_futex(struct futex_ + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer ++ * @exiting: Pointer to store the task pointer of the owner task ++ * which is in the middle of exiting + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. +@@ -1868,16 +1922,20 @@ void requeue_pi_wake_futex(struct futex_ + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. + * ++ * @exiting is only set when the return value is -EBUSY. If so, this holds ++ * a refcount on the exiting task on return and the caller needs to drop it ++ * after waiting for the exit to complete. 
++ * + * Return: + * - 0 - failed to acquire the lock atomically; + * - >0 - acquired the lock, return value is vpid of the top_waiter + * - <0 - error + */ +-static int futex_proxy_trylock_atomic(u32 __user *pifutex, +- struct futex_hash_bucket *hb1, +- struct futex_hash_bucket *hb2, +- union futex_key *key1, union futex_key *key2, +- struct futex_pi_state **ps, int set_waiters) ++static int ++futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ++ struct futex_hash_bucket *hb2, union futex_key *key1, ++ union futex_key *key2, struct futex_pi_state **ps, ++ struct task_struct **exiting, int set_waiters) + { + struct futex_q *top_waiter = NULL; + u32 curval; +@@ -1914,7 +1972,7 @@ static int futex_proxy_trylock_atomic(u3 + */ + vpid = task_pid_vnr(top_waiter->task); + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, +- set_waiters); ++ exiting, set_waiters); + if (ret == 1) { + requeue_pi_wake_futex(top_waiter, key2, hb2); + return vpid; +@@ -2043,6 +2101,8 @@ retry_private: + } + + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { ++ struct task_struct *exiting = NULL; ++ + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS +@@ -2050,7 +2110,8 @@ retry_private: + * faults rather in the requeue loop below. + */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, +- &key2, &pi_state, nr_requeue); ++ &key2, &pi_state, ++ &exiting, nr_requeue); + + /* + * At this point the top_waiter has either taken uaddr2 or is +@@ -2077,7 +2138,8 @@ retry_private: + * If that call succeeds then we have pi_state and an + * initial refcount on it. 
+ */ +- ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); ++ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, ++ &pi_state, &exiting); + } + + switch (ret) { +@@ -2107,6 +2169,12 @@ retry_private: + hb_waiters_dec(hb2); + put_futex_key(&key2); + put_futex_key(&key1); ++ /* ++ * Handle the case where the owner is in the middle of ++ * exiting. Wait for the exit to complete otherwise ++ * this task might loop forever, aka. live lock. ++ */ ++ wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: +@@ -2834,6 +2902,7 @@ static int futex_lock_pi(u32 __user *uad + { + struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; ++ struct task_struct *exiting = NULL; + struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; +@@ -2861,7 +2930,8 @@ retry: + retry_private: + hb = queue_lock(&q); + +- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); ++ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, ++ &exiting, 0); + if (unlikely(ret)) { + /* + * Atomic work succeeded and we got the lock, +@@ -2884,6 +2954,12 @@ retry_private: + */ + queue_unlock(hb); + put_futex_key(&q.key); ++ /* ++ * Handle the case where the owner is in the middle of ++ * exiting. Wait for the exit to complete otherwise ++ * this task might loop forever, aka. live lock. 
++ */ ++ wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: diff --git a/pending/futex-4.19/futex_Provide_distinct_return_value_when_owner_is_exiting.patch b/pending/futex-4.19/futex_Provide_distinct_return_value_when_owner_is_exiting.patch new file mode 100644 index 00000000000..6da55da7582 --- /dev/null +++ b/pending/futex-4.19/futex_Provide_distinct_return_value_when_owner_is_exiting.patch @@ -0,0 +1,81 @@ +Subject: futex: Provide distinct return value when owner is exiting +From: Thomas Gleixner +Date: Wed Nov 6 22:55:45 2019 +0100 + +From: Thomas Gleixner + +commit ac31c7ff8624409ba3c4901df9237a616c187a5d upstream + +attach_to_pi_owner() returns -EAGAIN for various cases: + + - Owner task is exiting + - Futex value has changed + +The caller drops the held locks (hash bucket, mmap_sem) and retries the +operation. In case of the owner task exiting this can result in a live +lock. + +As a preparatory step for seperating those cases, provide a distinct return +value (EBUSY) for the owner exiting case. + +No functional change. + +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.935606117@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1182,11 +1182,11 @@ static int handle_exit_race(u32 __user * + u32 uval2; + + /* +- * If the futex exit state is not yet FUTEX_STATE_DEAD, wait +- * for it to finish. ++ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the ++ * caller that the alleged owner is busy. 
+ */ + if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) +- return -EAGAIN; ++ return -EBUSY; + + /* + * Reread the user space value to handle the following situation: +@@ -2095,12 +2095,13 @@ retry_private: + if (!ret) + goto retry; + goto out; ++ case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: +- * - Owner is exiting and we just wait for the ++ * - EBUSY: Owner is exiting and we just wait for the + * exit to complete. +- * - The user space value changed. ++ * - EAGAIN: The user space value changed. + */ + double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); +@@ -2873,12 +2874,13 @@ retry_private: + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; ++ case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: +- * - Task is exiting and we just wait for the ++ * - EBUSY: Task is exiting and we just wait for the + * exit to complete. +- * - The user space value changed. ++ * - EAGAIN: The user space value changed. + */ + queue_unlock(hb); + put_futex_key(&q.key); diff --git a/pending/futex-4.19/futex_Provide_state_handling_for_exec()_as_well.patch b/pending/futex-4.19/futex_Provide_state_handling_for_exec()_as_well.patch new file mode 100644 index 00000000000..782f5a73616 --- /dev/null +++ b/pending/futex-4.19/futex_Provide_state_handling_for_exec()_as_well.patch @@ -0,0 +1,97 @@ +Subject: futex: Provide state handling for exec() as well +From: Thomas Gleixner +Date: Wed Nov 6 22:55:43 2019 +0100 + +From: Thomas Gleixner + +commit af8cbda2cfcaa5515d61ec500498d46e9a8247e2 upstream + +exec() attempts to handle potentially held futexes gracefully by running +the futex exit handling code like exit() does. + +The current implementation has no protection against concurrent incoming +waiters. The reason is that the futex state cannot be set to +FUTEX_STATE_DEAD after the cleanup because the task struct is still active +and just about to execute the new binary. 
+ +While it's arguably buggy when a task holds a futex over exec(), for +consistency's sake the state handling can at least cover the actual futex +exit cleanup section. This provides state consistency protection across +the cleanup. As the futex state of the task becomes FUTEX_STATE_OK after the +cleanup has been finished, this cannot prevent subsequent attempts to +attach to the task in case that the cleanup was not successful in mopping +up all leftovers. + +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.753355618@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 38 ++++++++++++++++++++++++++++++++++---- + 1 file changed, 34 insertions(+), 4 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -3698,7 +3698,7 @@ static void exit_robust_list(struct task + } + } + +-void futex_exec_release(struct task_struct *tsk) ++static void futex_cleanup(struct task_struct *tsk) + { + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); +@@ -3738,7 +3738,7 @@ void futex_exit_recursive(struct task_st + tsk->futex_state = FUTEX_STATE_DEAD; + } + +-void futex_exit_release(struct task_struct *tsk) ++static void futex_cleanup_begin(struct task_struct *tsk) + { + /* + * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. +@@ -3754,10 +3754,40 @@ void futex_exit_release(struct task_stru + raw_spin_lock_irq(&tsk->pi_lock); + tsk->futex_state = FUTEX_STATE_EXITING; + raw_spin_unlock_irq(&tsk->pi_lock); ++} + +- futex_exec_release(tsk); ++static void futex_cleanup_end(struct task_struct *tsk, int state) ++{ ++ /* ++ * Lockless store. The only side effect is that an observer might ++ * take another loop until it becomes visible.
++ */ ++ tsk->futex_state = state; ++} + +- tsk->futex_state = FUTEX_STATE_DEAD; ++void futex_exec_release(struct task_struct *tsk) ++{ ++ /* ++ * The state handling is done for consistency, but in the case of ++ * exec() there is no way to prevent futher damage as the PID stays ++ * the same. But for the unlikely and arguably buggy case that a ++ * futex is held on exec(), this provides at least as much state ++ * consistency protection which is possible. ++ */ ++ futex_cleanup_begin(tsk); ++ futex_cleanup(tsk); ++ /* ++ * Reset the state to FUTEX_STATE_OK. The task is alive and about ++ * exec a new binary. ++ */ ++ futex_cleanup_end(tsk, FUTEX_STATE_OK); ++} ++ ++void futex_exit_release(struct task_struct *tsk) ++{ ++ futex_cleanup_begin(tsk); ++ futex_cleanup(tsk); ++ futex_cleanup_end(tsk, FUTEX_STATE_DEAD); + } + + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, diff --git a/pending/futex-4.19/futex_Replace_PF_EXITPIDONE_with_a_state.patch b/pending/futex-4.19/futex_Replace_PF_EXITPIDONE_with_a_state.patch new file mode 100644 index 00000000000..11303d7f009 --- /dev/null +++ b/pending/futex-4.19/futex_Replace_PF_EXITPIDONE_with_a_state.patch @@ -0,0 +1,192 @@ +Subject: futex: Replace PF_EXITPIDONE with a state +From: Thomas Gleixner +Date: Wed Nov 6 22:55:37 2019 +0100 + +From: Thomas Gleixner + +commit 3d4775df0a89240f671861c6ab6e8d59af8e9e41 upstream + +The futex exit handling relies on PF_ flags. That's suboptimal as it +requires a smp_mb() and an ugly lock/unlock of the exiting tasks pi_lock in +the middle of do_exit() to enforce the observability of PF_EXITING in the +futex code. + +Add a futex_state member to task_struct and convert the PF_EXITPIDONE logic +over to the new state. The PF_EXITING dependency will be cleaned up in a +later step. + +This prepares for handling various futex exit issues later. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.149449274@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/futex.h | 33 +++++++++++++++++++++++++++++++++ + include/linux/sched.h | 2 +- + kernel/exit.c | 18 ++---------------- + kernel/futex.c | 25 +++++++++++++------------ + 4 files changed, 49 insertions(+), 29 deletions(-) + +--- a/include/linux/futex.h ++++ b/include/linux/futex.h +@@ -53,6 +53,10 @@ union futex_key { + #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } } + + #ifdef CONFIG_FUTEX ++enum { ++ FUTEX_STATE_OK, ++ FUTEX_STATE_DEAD, ++}; + + static inline void futex_init_task(struct task_struct *tsk) + { +@@ -62,6 +66,34 @@ static inline void futex_init_task(struc + #endif + INIT_LIST_HEAD(&tsk->pi_state_list); + tsk->pi_state_cache = NULL; ++ tsk->futex_state = FUTEX_STATE_OK; ++} ++ ++/** ++ * futex_exit_done - Sets the tasks futex state to FUTEX_STATE_DEAD ++ * @tsk: task to set the state on ++ * ++ * Set the futex exit state of the task lockless. The futex waiter code ++ * observes that state when a task is exiting and loops until the task has ++ * actually finished the futex cleanup. The worst case for this is that the ++ * waiter runs through the wait loop until the state becomes visible. ++ * ++ * This has two callers: ++ * ++ * - futex_mm_release() after the futex exit cleanup has been done ++ * ++ * - do_exit() from the recursive fault handling path. ++ * ++ * In case of a recursive fault this is best effort. Either the futex exit ++ * code has run already or not. If the OWNER_DIED bit has been set on the ++ * futex then the waiter can take it over. If not, the problem is pushed ++ * back to user space. If the futex exit code did not run yet, then an ++ * already queued waiter might block forever, but there is nothing which ++ * can be done about that. 
++ */ ++static inline void futex_exit_done(struct task_struct *tsk) ++{ ++ tsk->futex_state = FUTEX_STATE_DEAD; + } + + void futex_mm_release(struct task_struct *tsk); +@@ -71,6 +103,7 @@ long do_futex(u32 __user *uaddr, int op, + #else + static inline void futex_init_task(struct task_struct *tsk) { } + static inline void futex_mm_release(struct task_struct *tsk) { } ++static inline void futex_exit_done(struct task_struct *tsk) { } + static inline long do_futex(u32 __user *uaddr, int op, u32 val, + ktime_t *timeout, u32 __user *uaddr2, + u32 val2, u32 val3) +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -996,6 +996,7 @@ struct task_struct { + #endif + struct list_head pi_state_list; + struct futex_pi_state *pi_state_cache; ++ unsigned int futex_state; + #endif + #ifdef CONFIG_PERF_EVENTS + struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; +@@ -1377,7 +1378,6 @@ extern struct pid *cad_pid; + */ + #define PF_IDLE 0x00000002 /* I am an IDLE thread */ + #define PF_EXITING 0x00000004 /* Getting shut down */ +-#define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */ + #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ + #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ + #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -818,16 +818,7 @@ void __noreturn do_exit(long code) + */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); +- /* +- * We can do this unlocked here. The futex code uses +- * this flag just to verify whether the pi state +- * cleanup has been done or not. In the worst case it +- * loops once more. We pretend that the cleanup was +- * done as there is no way to return. Either the +- * OWNER_DIED bit is set by now or we push the blocked +- * task into the wait for ever nirwana as well. 
+- */ +- tsk->flags |= PF_EXITPIDONE; ++ futex_exit_done(tsk); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } +@@ -918,12 +909,7 @@ void __noreturn do_exit(long code) + * Make sure we are holding no locks: + */ + debug_check_no_locks_held(); +- /* +- * We can do this unlocked here. The futex code uses this flag +- * just to verify whether the pi state cleanup has been done +- * or not. In the worst case it loops once more. +- */ +- tsk->flags |= PF_EXITPIDONE; ++ futex_exit_done(tsk); + + if (tsk->io_context) + exit_io_context(tsk); +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1182,9 +1182,10 @@ static int handle_exit_race(u32 __user * + u32 uval2; + + /* +- * If PF_EXITPIDONE is not yet set, then try again. ++ * If the futex exit state is not yet FUTEX_STATE_DEAD, wait ++ * for it to finish. + */ +- if (tsk && !(tsk->flags & PF_EXITPIDONE)) ++ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) + return -EAGAIN; + + /* +@@ -1203,8 +1204,9 @@ static int handle_exit_race(u32 __user * + * *uaddr = 0xC0000000; tsk = get_task(PID); + * } if (!tsk->flags & PF_EXITING) { + * ... attach(); +- * tsk->flags |= PF_EXITPIDONE; } else { +- * if (!(tsk->flags & PF_EXITPIDONE)) ++ * tsk->futex_state = } else { ++ * FUTEX_STATE_DEAD; if (tsk->futex_state != ++ * FUTEX_STATE_DEAD) + * return -EAGAIN; + * return -ESRCH; <--- FAIL + * } +@@ -1260,17 +1262,16 @@ static int attach_to_pi_owner(u32 __user + } + + /* +- * We need to look at the task state flags to figure out, +- * whether the task is exiting. To protect against the do_exit +- * change of the task flags, we do this protected by +- * p->pi_lock: ++ * We need to look at the task state to figure out, whether the ++ * task is exiting. 
To protect against the change of the task state ++ * in futex_exit_release(), we do this protected by p->pi_lock: + */ + raw_spin_lock_irq(&p->pi_lock); +- if (unlikely(p->flags & PF_EXITING)) { ++ if (unlikely(p->futex_state != FUTEX_STATE_OK)) { + /* +- * The task is on the way out. When PF_EXITPIDONE is +- * set, we know that the task has finished the +- * cleanup: ++ * The task is on the way out. When the futex state is ++ * FUTEX_STATE_DEAD, we know that the task has finished ++ * the cleanup: + */ + int ret = handle_exit_race(uaddr, uval, p); + diff --git a/pending/futex-4.19/futex_Sanitize_exit_state_handling.patch b/pending/futex-4.19/futex_Sanitize_exit_state_handling.patch new file mode 100644 index 00000000000..ea0c362f2bc --- /dev/null +++ b/pending/futex-4.19/futex_Sanitize_exit_state_handling.patch @@ -0,0 +1,50 @@ +Subject: futex: Sanitize exit state handling +From: Thomas Gleixner +Date: Wed Nov 6 22:55:42 2019 +0100 + +From: Thomas Gleixner + +commit 4a8e991b91aca9e20705d434677ac013974e0e30 upstream + +Instead of having a smp_mb() and an empty lock/unlock of task::pi_lock move +the state setting into the lock section. + +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.645603214@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 17 ++++++++++------- + 1 file changed, 10 insertions(+), 7 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -3740,16 +3740,19 @@ void futex_exit_recursive(struct task_st + + void futex_exit_release(struct task_struct *tsk) + { +- tsk->futex_state = FUTEX_STATE_EXITING; +- /* +- * Ensure that all new tsk->pi_lock acquisitions must observe +- * FUTEX_STATE_EXITING. Serializes against attach_to_pi_owner(). +- */ +- smp_mb(); + /* +- * Ensure that we must observe the pi_state in exit_pi_state_list(). ++ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
++ * ++ * This ensures that all subsequent checks of tsk->futex_state in ++ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with ++ * tsk->pi_lock held. ++ * ++ * It guarantees also that a pi_state which was queued right before ++ * the state change under tsk->pi_lock by a concurrent waiter must ++ * be observed in exit_pi_state_list(). + */ + raw_spin_lock_irq(&tsk->pi_lock); ++ tsk->futex_state = FUTEX_STATE_EXITING; + raw_spin_unlock_irq(&tsk->pi_lock); + + futex_exec_release(tsk); diff --git a/pending/futex-4.19/futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch b/pending/futex-4.19/futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch new file mode 100644 index 00000000000..8ea9cbbb411 --- /dev/null +++ b/pending/futex-4.19/futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch @@ -0,0 +1,46 @@ +Subject: futex: Set task::futex_state to DEAD right after handling futex exit +From: Thomas Gleixner +Date: Wed Nov 6 22:55:40 2019 +0100 + +From: Thomas Gleixner + +commit f24f22435dcc11389acc87e5586239c1819d217c upstream + +Setting task::futex_state in do_exit() is rather arbitrarily placed for no +reason. Move it into the futex code. + +Note, this is only done for the exit cleanup as the exec cleanup cannot set +the state to FUTEX_STATE_DEAD because the task struct is still in active +use. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.439511191@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + kernel/exit.c | 1 - + kernel/futex.c | 1 + + 2 files changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -909,7 +909,6 @@ void __noreturn do_exit(long code) + * Make sure we are holding no locks: + */ + debug_check_no_locks_held(); +- futex_exit_done(tsk); + + if (tsk->io_context) + exit_io_context(tsk); +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -3719,6 +3719,7 @@ void futex_exec_release(struct task_stru + void futex_exit_release(struct task_struct *tsk) + { + futex_exec_release(tsk); ++ futex_exit_done(tsk); + } + + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, diff --git a/pending/futex-4.19/futex_Split_futex_mm_release()_for_exitexec.patch b/pending/futex-4.19/futex_Split_futex_mm_release()_for_exitexec.patch new file mode 100644 index 00000000000..ef555df81fd --- /dev/null +++ b/pending/futex-4.19/futex_Split_futex_mm_release()_for_exitexec.patch @@ -0,0 +1,96 @@ +Subject: futex: Split futex_mm_release() for exit/exec +From: Thomas Gleixner +Date: Wed Nov 6 22:55:39 2019 +0100 + +From: Thomas Gleixner + +commit 150d71584b12809144b8145b817e83b81158ae5f upstream + +To allow separate handling of the futex exit state in the futex exit code +for exit and exec, split futex_mm_release() into two functions and invoke +them from the corresponding exit/exec_mm_release() callsites. + +Preparatory only, no functional change. 
+ +Signed-off-by: Thomas Gleixner +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20191106224556.332094221@linutronix.de +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/futex.h | 6 ++++-- + kernel/fork.c | 5 ++--- + kernel/futex.c | 7 ++++++- + 3 files changed, 12 insertions(+), 6 deletions(-) + +--- a/include/linux/futex.h ++++ b/include/linux/futex.h +@@ -96,14 +96,16 @@ static inline void futex_exit_done(struc + tsk->futex_state = FUTEX_STATE_DEAD; + } + +-void futex_mm_release(struct task_struct *tsk); ++void futex_exit_release(struct task_struct *tsk); ++void futex_exec_release(struct task_struct *tsk); + + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3); + #else + static inline void futex_init_task(struct task_struct *tsk) { } +-static inline void futex_mm_release(struct task_struct *tsk) { } + static inline void futex_exit_done(struct task_struct *tsk) { } ++static inline void futex_exit_release(struct task_struct *tsk) { } ++static inline void futex_exec_release(struct task_struct *tsk) { } + static inline long do_futex(u32 __user *uaddr, int op, u32 val, + ktime_t *timeout, u32 __user *uaddr2, + u32 val2, u32 val3) +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1219,9 +1219,6 @@ static int wait_for_vfork_done(struct ta + */ + static void mm_release(struct task_struct *tsk, struct mm_struct *mm) + { +- /* Get rid of any futexes when releasing the mm */ +- futex_mm_release(tsk); +- + uprobe_free_utask(tsk); + + /* Get rid of any cached register state */ +@@ -1256,11 +1253,13 @@ static void mm_release(struct task_struc + + void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) + { ++ futex_exit_release(tsk); + mm_release(tsk, mm); + } + + void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) + { ++ futex_exec_release(tsk); + mm_release(tsk, mm); + } + +--- a/kernel/futex.c ++++ 
b/kernel/futex.c +@@ -3698,7 +3698,7 @@ static void exit_robust_list(struct task + } + } + +-void futex_mm_release(struct task_struct *tsk) ++void futex_exec_release(struct task_struct *tsk) + { + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); +@@ -3716,6 +3716,11 @@ void futex_mm_release(struct task_struct + exit_pi_state_list(tsk); + } + ++void futex_exit_release(struct task_struct *tsk) ++{ ++ futex_exec_release(tsk); ++} ++ + long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3) + { diff --git a/pending/futex-4.19/series b/pending/futex-4.19/series new file mode 100644 index 00000000000..df259c9b485 --- /dev/null +++ b/pending/futex-4.19/series @@ -0,0 +1,11 @@ +futex_Move_futex_exit_handling_into_futex_code.patch +futex_Replace_PF_EXITPIDONE_with_a_state.patch +exitexec_Seperate_mm_release().patch +futex_Split_futex_mm_release()_for_exitexec.patch +futex_Set_taskfutex_state_to_DEAD_right_after_handling_futex_exit.patch +futex_Mark_the_begin_of_futex_exit_explicitly.patch +futex_Sanitize_exit_state_handling.patch +futex_Provide_state_handling_for_exec()_as_well.patch +futex_Add_mutex_around_futex_exit.patch +futex_Provide_distinct_return_value_when_owner_is_exiting.patch +futex_Prevent_exit_livelock.patch