From c99d299e4fa9b7b43cf205efff136ab46d2b33be Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 29 Mar 2021 07:49:51 +0200 Subject: [PATCH] 4.9-stable patches added patches: arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch futex-avoid-freeing-an-active-timer.patch futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch futex-fix-incorrect-should_fail_futex-handling.patch futex-fix-possible-missed-wakeup.patch futex-handle-early-deadlock-return-correctly.patch futex-handle-transient-ownerless-rtmutex-state-correctly.patch futex-prevent-robust-futex-exit-race.patch futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch futex-rt_mutex-introduce-rt_mutex_init_waiter.patch futex-use-smp_store_release-in-mark_wake_futex.patch locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch --- ...-of-ldxr-stxr-loops-in-futex_wake_op.patch | 153 ++++++++ .../futex-avoid-freeing-an-active-timer.patch | 61 ++++ ...ock-before-enqueueing-on-the-rtmutex.patch | 211 +++++++++++ ...incorrect-should_fail_futex-handling.patch | 50 +++ .../futex-fix-possible-missed-wakeup.patch | 64 ++++ ...ndle-early-deadlock-return-correctly.patch | 231 ++++++++++++ ...nt-ownerless-rtmutex-state-correctly.patch | 85 +++++ ...futex-prevent-robust-futex-exit-race.patch | 266 ++++++++++++++ ...lock_pi-to-use-rt_mutex_-_proxy_lock.patch | 273 ++++++++++++++ ...utex-fix-rt_mutex_cleanup_proxy_lock.patch | 135 +++++++ ...mutex-introduce-rt_mutex_init_waiter.patch | 88 +++++ ...smp_store_release-in-mark_wake_futex.patch | 46 +++ ...l-atomic-operations-to-return-eagain.patch | 337 ++++++++++++++++++ queue-4.9/series | 13 + 14 files changed, 2013 insertions(+) create mode 100644 queue-4.9/arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch create mode 100644 queue-4.9/futex-avoid-freeing-an-active-timer.patch create mode 100644 queue-4.9/futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch create mode 100644 queue-4.9/futex-fix-incorrect-should_fail_futex-handling.patch create mode 100644 queue-4.9/futex-fix-possible-missed-wakeup.patch create mode 100644 queue-4.9/futex-handle-early-deadlock-return-correctly.patch create mode 100644 queue-4.9/futex-handle-transient-ownerless-rtmutex-state-correctly.patch create mode 100644 queue-4.9/futex-prevent-robust-futex-exit-race.patch create mode 100644 queue-4.9/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch create mode 100644 queue-4.9/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch create mode 100644 queue-4.9/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch create mode 100644 queue-4.9/futex-use-smp_store_release-in-mark_wake_futex.patch create mode 100644 queue-4.9/locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch diff --git a/queue-4.9/arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch b/queue-4.9/arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch new file mode 100644 index 00000000000..7fc9294a557 --- /dev/null +++ b/queue-4.9/arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch @@ -0,0 +1,153 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:42:51 +0200 +Subject: arm64: futex: Bound number of LDXR/STXR loops in FUTEX_WAKE_OP +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. 
Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Will Deacon + +commit 03110a5cb2161690ae5ac04994d47ed0cd6cef75 upstream. + +Our futex implementation makes use of LDXR/STXR loops to perform atomic +updates to user memory from atomic context. This can lead to latency +problems if we end up spinning around the LL/SC sequence at the expense +of doing something useful. + +Rework our futex atomic operations so that we return -EAGAIN if we fail +to update the futex word after 128 attempts. The core futex code will +reschedule if necessary and we'll try again later. + +Fixes: 6170a97460db ("arm64: Atomic operations") +Signed-off-by: Will Deacon +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/include/asm/futex.h | 59 +++++++++++++++++++++++++---------------- + 1 file changed, 37 insertions(+), 22 deletions(-) + +--- a/arch/arm64/include/asm/futex.h ++++ b/arch/arm64/include/asm/futex.h +@@ -26,7 +26,12 @@ + #include + #include + ++#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */ ++ + #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ ++do { \ ++ unsigned int loops = FUTEX_MAX_LOOPS; \ ++ \ + asm volatile( \ + ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ +@@ -34,21 +39,26 @@ + "1: ldxr %w1, %2\n" \ + insn "\n" \ + "2: stlxr %w0, %w3, %2\n" \ +-" cbnz %w0, 1b\n" \ +-" dmb ish\n" \ ++" cbz %w0, 3f\n" \ ++" sub %w4, %w4, %w0\n" \ ++" cbnz %w4, 1b\n" \ ++" mov %w0, %w7\n" \ + "3:\n" \ ++" dmb ish\n" \ + " .pushsection .fixup,\"ax\"\n" \ + " .align 2\n" \ +-"4: mov %w0, %w5\n" \ ++"4: mov %w0, %w6\n" \ + " b 3b\n" \ + " .popsection\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ + ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ + CONFIG_ARM64_PAN) \ +- : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ +- : "r" (oparg), "Ir" (-EFAULT) \ +- : "memory") ++ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \ ++ "+r" (loops) \ ++ : "r" (oparg), "Ir" (-EFAULT), "Ir" (-EAGAIN) \ ++ : "memory"); \ ++} while (0) + + static inline int + arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) +@@ -59,23 +69,23 @@ arch_futex_atomic_op_inuser(int op, int + + switch (op) { + case FUTEX_OP_SET: +- __futex_atomic_op("mov %w3, %w4", ++ __futex_atomic_op("mov %w3, %w5", + ret, oldval, uaddr, tmp, oparg); + break; + case FUTEX_OP_ADD: +- __futex_atomic_op("add %w3, %w1, %w4", ++ __futex_atomic_op("add %w3, %w1, %w5", + ret, oldval, uaddr, tmp, oparg); + break; + case FUTEX_OP_OR: +- __futex_atomic_op("orr %w3, %w1, %w4", ++ __futex_atomic_op("orr %w3, %w1, %w5", + ret, oldval, uaddr, tmp, oparg); + break; + case FUTEX_OP_ANDN: +- __futex_atomic_op("and %w3, %w1, %w4", ++ __futex_atomic_op("and %w3, %w1, %w5", + ret, oldval, uaddr, tmp, ~oparg); + break; + case FUTEX_OP_XOR: +- __futex_atomic_op("eor %w3, %w1, %w4", ++ __futex_atomic_op("eor %w3, %w1, %w5", + ret, oldval, uaddr, tmp, oparg); + break; + default: +@@ -95,6 +105,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, + u32 oldval, u32 newval) + { + int ret = 0; ++ unsigned int loops = FUTEX_MAX_LOOPS; + u32 val, tmp; + u32 __user *uaddr; + +@@ -106,21 +117,25 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, + ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + " prfm pstl1strm, %2\n" + "1: ldxr %w1, %2\n" +-" sub %w3, %w1, %w4\n" +-" cbnz %w3, 3f\n" +-"2: stlxr %w3, %w5, %2\n" +-" cbnz %w3, 1b\n" +-" dmb ish\n" ++" sub 
%w3, %w1, %w5\n" ++" cbnz %w3, 4f\n" ++"2: stlxr %w3, %w6, %2\n" ++" cbz %w3, 3f\n" ++" sub %w4, %w4, %w3\n" ++" cbnz %w4, 1b\n" ++" mov %w0, %w8\n" + "3:\n" ++" dmb ish\n" ++"4:\n" + " .pushsection .fixup,\"ax\"\n" +-"4: mov %w0, %w6\n" +-" b 3b\n" ++"5: mov %w0, %w7\n" ++" b 4b\n" + " .popsection\n" +- _ASM_EXTABLE(1b, 4b) +- _ASM_EXTABLE(2b, 4b) ++ _ASM_EXTABLE(1b, 5b) ++ _ASM_EXTABLE(2b, 5b) + ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) +- : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) +- : "r" (oldval), "r" (newval), "Ir" (-EFAULT) ++ : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) ++ : "r" (oldval), "r" (newval), "Ir" (-EFAULT), "Ir" (-EAGAIN) + : "memory"); + + *uval = val; diff --git a/queue-4.9/futex-avoid-freeing-an-active-timer.patch b/queue-4.9/futex-avoid-freeing-an-active-timer.patch new file mode 100644 index 00000000000..c87b966fa44 --- /dev/null +++ b/queue-4.9/futex-avoid-freeing-an-active-timer.patch @@ -0,0 +1,61 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:42:00 +0200 +Subject: futex: Avoid freeing an active timer +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Thomas Gleixner + +commit 97181f9bd57405b879403763284537e27d46963d upstream. + +Alexander reported a hrtimer debug_object splat: + + ODEBUG: free active (active state 0) object type: hrtimer hint: hrtimer_wakeup (kernel/time/hrtimer.c:1423) + + debug_object_free (lib/debugobjects.c:603) + destroy_hrtimer_on_stack (kernel/time/hrtimer.c:427) + futex_lock_pi (kernel/futex.c:2740) + do_futex (kernel/futex.c:3399) + SyS_futex (kernel/futex.c:3447 kernel/futex.c:3415) + do_syscall_64 (arch/x86/entry/common.c:284) + entry_SYSCALL64_slow_path (arch/x86/entry/entry_64.S:249) + +Which was caused by commit: + + cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") + +... losing the hrtimer_cancel() in the shuffle. Where previously the +hrtimer_cancel() was done by rt_mutex_slowlock() we now need to do it +manually. + +Reported-by: Alexander Levin +Signed-off-by: Thomas Gleixner +Signed-off-by: Peter Zijlstra (Intel) +Cc: Linus Torvalds +Cc: Peter Zijlstra +Fixes: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") +Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704101802370.2906@nanos +Signed-off-by: Ingo Molnar +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -3018,8 +3018,10 @@ out_unlock_put_key: + out_put_key: + put_futex_key(&q.key); + out: +- if (to) ++ if (to) { ++ hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); ++ } + return ret != -EINTR ? ret : -ERESTARTNOINTR; + + uaddr_faulted: diff --git a/queue-4.9/futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch b/queue-4.9/futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch new file mode 100644 index 00000000000..a3d59d6518f --- /dev/null +++ b/queue-4.9/futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch @@ -0,0 +1,211 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:41:51 +0200 +Subject: futex: Drop hb->lock before enqueueing on the rtmutex +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. 
Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Peter Zijlstra + +commit 56222b212e8edb1cf51f5dd73ff645809b082b40 upstream. + +When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution the PI +chain code will (falsely) report a deadlock and BUG. + +The problem is that it hold hb->lock (now an rt_mutex) while doing +task_blocks_on_rt_mutex on the futex's pi_state::rtmutex. This, when +interleaved just right with futex_unlock_pi() leads it to believe to see an +AB-BA deadlock. + + Task1 (holds rt_mutex, Task2 (does FUTEX_LOCK_PI) + does FUTEX_UNLOCK_PI) + + lock hb->lock + lock rt_mutex (as per start_proxy) + lock hb->lock + +Which is a trivial AB-BA. + +It is not an actual deadlock, because it won't be holding hb->lock by the +time it actually blocks on the rt_mutex, but the chainwalk code doesn't +know that and it would be a nightmare to handle this gracefully. + +To avoid this problem, do the same as in futex_unlock_pi() and drop +hb->lock after acquiring wait_lock. This still fully serializes against +futex_unlock_pi(), since adding to the wait_list does the very same lock +dance, and removing it holds both locks. + +Aside of solving the RT problem this makes the lock and unlock mechanism +symetric and reduces the hb->lock held time. + +Reported-and-tested-by: Sebastian Andrzej Siewior +Suggested-by: Thomas Gleixner +Signed-off-by: Peter Zijlstra (Intel) +Cc: juri.lelli@arm.com +Cc: xlpang@redhat.com +Cc: rostedt@goodmis.org +Cc: mathieu.desnoyers@efficios.com +Cc: jdesfossez@efficios.com +Cc: dvhart@infradead.org +Cc: bristot@redhat.com +Link: http://lkml.kernel.org/r/20170322104152.161341537@infradead.org +Signed-off-by: Thomas Gleixner +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 30 +++++++++++++++++------- + kernel/locking/rtmutex.c | 49 ++++++++++++++++++++++------------------ + kernel/locking/rtmutex_common.h | 3 ++ + 3 files changed, 52 insertions(+), 30 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2948,20 +2948,33 @@ retry_private: + goto no_block; + } + ++ rt_mutex_init_waiter(&rt_waiter); ++ + /* +- * We must add ourselves to the rt_mutex waitlist while holding hb->lock +- * such that the hb and rt_mutex wait lists match. ++ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not ++ * hold it while doing rt_mutex_start_proxy(), because then it will ++ * include hb->lock in the blocking chain, even through we'll not in ++ * fact hold it while blocking. This will lead it to report -EDEADLK ++ * and BUG when futex_unlock_pi() interleaves with this. ++ * ++ * Therefore acquire wait_lock while holding hb->lock, but drop the ++ * latter before calling rt_mutex_start_proxy_lock(). This still fully ++ * serializes against futex_unlock_pi() as that does the exact same ++ * lock handoff sequence. 
+ */ +- rt_mutex_init_waiter(&rt_waiter); +- ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); ++ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); ++ spin_unlock(q.lock_ptr); ++ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); ++ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); ++ + if (ret) { + if (ret == 1) + ret = 0; + ++ spin_lock(q.lock_ptr); + goto no_block; + } + +- spin_unlock(q.lock_ptr); + + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); +@@ -2974,6 +2987,9 @@ retry_private: + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex + * wait lists consistent. ++ * ++ * In particular; it is important that futex_unlock_pi() can not ++ * observe this inconsistency. + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; +@@ -3071,10 +3087,6 @@ retry: + + get_pi_state(pi_state); + /* +- * Since modifying the wait_list is done while holding both +- * hb->lock and wait_lock, holding either is sufficient to +- * observe it. +- * + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1695,31 +1695,14 @@ void rt_mutex_proxy_unlock(struct rt_mut + rt_mutex_set_owner(lock, NULL); + } + +-/** +- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task +- * @lock: the rt_mutex to take +- * @waiter: the pre-initialized rt_mutex_waiter +- * @task: the task to prepare +- * +- * Returns: +- * 0 - task blocked on lock +- * 1 - acquired the lock for task, caller should wake it up +- * <0 - error +- * +- * Special API call for FUTEX_REQUEUE_PI support. +- */ +-int rt_mutex_start_proxy_lock(struct rt_mutex *lock, ++int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) + { + int ret; + +- raw_spin_lock_irq(&lock->wait_lock); +- +- if (try_to_take_rt_mutex(lock, task, NULL)) { +- raw_spin_unlock_irq(&lock->wait_lock); ++ if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; +- } + + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, +@@ -1738,12 +1721,36 @@ int rt_mutex_start_proxy_lock(struct rt_ + if (unlikely(ret)) + remove_waiter(lock, waiter); + +- raw_spin_unlock_irq(&lock->wait_lock); +- + debug_rt_mutex_print_deadlock(waiter); + + return ret; + } ++ ++/** ++ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task ++ * @lock: the rt_mutex to take ++ * @waiter: the pre-initialized rt_mutex_waiter ++ * @task: the task to prepare ++ * ++ * Returns: ++ * 0 - task blocked on lock ++ * 1 - acquired the lock for task, caller should wake it up ++ * <0 - error ++ * ++ * Special API call for FUTEX_REQUEUE_PI support. 
++ */
++int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++			      struct rt_mutex_waiter *waiter,
++			      struct task_struct *task)
++{
++	int ret;
++
++	raw_spin_lock_irq(&lock->wait_lock);
++	ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
++	raw_spin_unlock_irq(&lock->wait_lock);
++
++	return ret;
++}
+ 
+ /**
+  * rt_mutex_next_owner - return the next owner of the lock
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s
+ 					struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
+ extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
++extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++				       struct rt_mutex_waiter *waiter,
++				       struct task_struct *task);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ 				     struct rt_mutex_waiter *waiter,
+ 				     struct task_struct *task);
diff --git a/queue-4.9/futex-fix-incorrect-should_fail_futex-handling.patch b/queue-4.9/futex-fix-incorrect-should_fail_futex-handling.patch
new file mode 100644
index 00000000000..0fc668e3482
--- /dev/null
+++ b/queue-4.9/futex-fix-incorrect-should_fail_futex-handling.patch
@@ -0,0 +1,50 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings 
+Date: Sun, 28 Mar 2021 22:43:10 +0200
+Subject: futex: Fix incorrect should_fail_futex() handling
+To: stable@vger.kernel.org
+Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli 
+Message-ID: 
+Content-Disposition: inline
+
+From: Mateusz Nosek 
+
+commit 921c7ebd1337d1a46783d7e15a850e12aed2eaa0 upstream.
+
+If should_fail_futex() returns true in futex_wake_pi(), then the 'ret'
+variable is set to -EFAULT and then immediately overwritten. So the failure
+injection is non-functional.
+
+Fix it by actually leaving the function and returning -EFAULT.
+
+The Fixes tag is kinda blurry because the initial commit which introduced
+failure injection was already sloppy, but the below mentioned commit broke
+it completely.
+
+[ tglx: Massaged changelog ]
+
+Fixes: 6b4f4bc9cb22 ("locking/futex: Allow low-level atomic operations to return -EAGAIN")
+Signed-off-by: Mateusz Nosek 
+Signed-off-by: Thomas Gleixner 
+Link: https://lore.kernel.org/r/20200927000858.24219-1-mateusznosek0@gmail.com
+Signed-off-by: Sasha Levin 
+Signed-off-by: Ben Hutchings 
+Signed-off-by: Greg Kroah-Hartman 
+---
+ kernel/futex.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1605,8 +1605,10 @@ static int wake_futex_pi(u32 __user *uad
+ 	 */
+ 	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ 
+-	if (unlikely(should_fail_futex(true)))
++	if (unlikely(should_fail_futex(true))) {
+ 		ret = -EFAULT;
++		goto out_unlock;
++	}
+ 
+ 	ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ 	if (!ret && (curval != uval)) {
diff --git a/queue-4.9/futex-fix-possible-missed-wakeup.patch b/queue-4.9/futex-fix-possible-missed-wakeup.patch
new file mode 100644
index 00000000000..9c8e2147d03
--- /dev/null
+++ b/queue-4.9/futex-fix-possible-missed-wakeup.patch
@@ -0,0 +1,64 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings 
+Date: Sun, 28 Mar 2021 22:42:32 +0200
+Subject: futex: Fix (possible) missed wakeup
+To: stable@vger.kernel.org
+Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli 
+Message-ID: 
+Content-Disposition: inline
+
+From: Peter Zijlstra 
+
+commit b061c38bef43406df8e73c5be06cbfacad5ee6ad upstream.
+
+We must not rely on wake_q_add() to delay the wakeup; in particular
+commit:
+
+  1d0dcb3ad9d3 ("futex: Implement lockless wakeups")
+
+moved wake_q_add() before smp_store_release(&q->lock_ptr, NULL), which
+could result in futex_wait() waking before observing ->lock_ptr ==
+NULL and going back to sleep again.
+
+Signed-off-by: Peter Zijlstra (Intel) 
+Cc: Linus Torvalds 
+Cc: Peter Zijlstra 
+Cc: Thomas Gleixner 
+Fixes: 1d0dcb3ad9d3 ("futex: Implement lockless wakeups")
+Signed-off-by: Ingo Molnar 
+Signed-off-by: Sasha Levin 
+Signed-off-by: Ben Hutchings 
+Signed-off-by: Greg Kroah-Hartman 
+---
+ kernel/futex.c |   13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1553,11 +1553,7 @@ static void mark_wake_futex(struct wake_
+ 	if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ 		return;
+ 
+-	/*
+-	 * Queue the task for later wakeup for after we've released
+-	 * the hb->lock. wake_q_add() grabs reference to p.
+-	 */
+-	wake_q_add(wake_q, p);
++	get_task_struct(p);
+ 	__unqueue_futex(q);
+ 	/*
+ 	 * The waiting task can free the futex_q as soon as
+@@ -1566,6 +1562,13 @@ static void mark_wake_futex(struct wake_
+ 	 * store to lock_ptr from getting ahead of the plist_del.
+ 	 */
+ 	smp_store_release(&q->lock_ptr, NULL);
++
++	/*
++	 * Queue the task for later wakeup for after we've released
++	 * the hb->lock. wake_q_add() grabs reference to p.
++	 */
++	wake_q_add(wake_q, p);
++	put_task_struct(p);
+ }
+ 
+ /*
diff --git a/queue-4.9/futex-handle-early-deadlock-return-correctly.patch b/queue-4.9/futex-handle-early-deadlock-return-correctly.patch
new file mode 100644
index 00000000000..c5d088cd059
--- /dev/null
+++ b/queue-4.9/futex-handle-early-deadlock-return-correctly.patch
@@ -0,0 +1,231 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings 
+Date: Sun, 28 Mar 2021 22:42:20 +0200
+Subject: futex: Handle early deadlock return correctly
+To: stable@vger.kernel.org
+Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli 
+Message-ID: 
+Content-Disposition: inline
+
+From: Thomas Gleixner 
+
+commit 1a1fb985f2e2b85ec0d3dc2e519ee48389ec2434 upstream.
+
+commit 56222b212e8e ("futex: Drop hb->lock before enqueueing on the
+rtmutex") changed the locking rules in the futex code so that the hash
+bucket lock is no longer held while the waiter is enqueued into the
+rtmutex wait list. This made the lock and the unlock path symmetric, but
+unfortunately the possible early exit from __rt_mutex_proxy_start() due to
+a detected deadlock was not updated accordingly. That allows a concurrent
+unlocker to observe inconsistent state which triggers the warning in the
+unlock path.
+
+futex_lock_pi()                  futex_unlock_pi()
+  lock(hb->lock)
+  queue(hb_waiter)               lock(hb->lock)
+  lock(rtmutex->wait_lock)
+  unlock(hb->lock)
+                                 // acquired hb->lock
+                                 hb_waiter = futex_top_waiter()
+                                 lock(rtmutex->wait_lock)
+  __rt_mutex_proxy_start()
+     ---> fail
+          remove(rtmutex_waiter);
+     ---> returns -EDEADLOCK
+  unlock(rtmutex->wait_lock)
+                                 // acquired wait_lock
+                                 wake_futex_pi()
+                                 rt_mutex_next_owner()
+                                   --> returns NULL
+                                   --> WARN
+
+  lock(hb->lock)
+  unqueue(hb_waiter)
+
+The problem is caused by the remove(rtmutex_waiter) in the failure case of
+__rt_mutex_proxy_start() as this lets the unlocker observe a waiter in the
+hash bucket but no waiter on the rtmutex, i.e. inconsistent state.
+ +The original commit handles this correctly for the other early return cases +(timeout, signal) by delaying the removal of the rtmutex waiter until the +returning task reacquired the hash bucket lock. + +Treat the failure case of __rt_mutex_proxy_start() in the same way and let +the existing cleanup code handle the eventual handover of the rtmutex +gracefully. The regular rt_mutex_proxy_start() gains the rtmutex waiter +removal for the failure case, so that the other callsites are still +operating correctly. + +Add proper comments to the code so all these details are fully documented. + +Thanks to Peter for helping with the analysis and writing the really +valuable code comments. + +Fixes: 56222b212e8e ("futex: Drop hb->lock before enqueueing on the rtmutex") +Reported-by: Heiko Carstens +Co-developed-by: Peter Zijlstra +Signed-off-by: Peter Zijlstra +Signed-off-by: Thomas Gleixner +Tested-by: Heiko Carstens +Cc: Martin Schwidefsky +Cc: linux-s390@vger.kernel.org +Cc: Stefan Liebler +Cc: Sebastian Sewior +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1901292311410.1950@nanos.tec.linutronix.de +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 28 ++++++++++++++++++---------- + kernel/locking/rtmutex.c | 37 ++++++++++++++++++++++++++++++++----- + 2 files changed, 50 insertions(+), 15 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2958,35 +2958,39 @@ retry_private: + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the +- * latter before calling rt_mutex_start_proxy_lock(). This still fully +- * serializes against futex_unlock_pi() as that does the exact same +- * lock handoff sequence. ++ * latter before calling __rt_mutex_start_proxy_lock(). This ++ * interleaves with futex_unlock_pi() -- which does a similar lock ++ * handoff -- such that the latter can observe the futex_q::pi_state ++ * before __rt_mutex_start_proxy_lock() is done. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); ++ /* ++ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter ++ * such that futex_unlock_pi() is guaranteed to observe the waiter when ++ * it sees the futex_q::pi_state. ++ */ + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + + if (ret) { + if (ret == 1) + ret = 0; +- +- spin_lock(q.lock_ptr); +- goto no_block; ++ goto cleanup; + } + +- + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + ++cleanup: + spin_lock(q.lock_ptr); + /* +- * If we failed to acquire the lock (signal/timeout), we must ++ * If we failed to acquire the lock (deadlock/signal/timeout), we must + * first acquire the hb->lock before removing the lock from the +- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex +- * wait lists consistent. ++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait ++ * lists consistent. + * + * In particular; it is important that futex_unlock_pi() can not + * observe this inconsistency. +@@ -3093,6 +3097,10 @@ retry: + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. 
++ * ++ * In particular; this forces __rt_mutex_start_proxy() to ++ * complete such that we're guaranteed to observe the ++ * rt_waiter. Also see the WARN in wake_futex_pi(). + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1695,12 +1695,33 @@ void rt_mutex_proxy_unlock(struct rt_mut + rt_mutex_set_owner(lock, NULL); + } + ++/** ++ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task ++ * @lock: the rt_mutex to take ++ * @waiter: the pre-initialized rt_mutex_waiter ++ * @task: the task to prepare ++ * ++ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock ++ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. ++ * ++ * NOTE: does _NOT_ remove the @waiter on failure; must either call ++ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. ++ * ++ * Returns: ++ * 0 - task blocked on lock ++ * 1 - acquired the lock for task, caller should wake it up ++ * <0 - error ++ * ++ * Special API call for PI-futex support. ++ */ + int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) + { + int ret; + ++ lockdep_assert_held(&lock->wait_lock); ++ + if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; + +@@ -1718,9 +1739,6 @@ int __rt_mutex_start_proxy_lock(struct r + ret = 0; + } + +- if (unlikely(ret)) +- remove_waiter(lock, waiter); +- + debug_rt_mutex_print_deadlock(waiter); + + return ret; +@@ -1732,12 +1750,18 @@ int __rt_mutex_start_proxy_lock(struct r + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * ++ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock ++ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. ++ * ++ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter ++ * on failure. ++ * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * +- * Special API call for FUTEX_REQUEUE_PI support. ++ * Special API call for PI-futex support. + */ + int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, +@@ -1747,6 +1771,8 @@ int rt_mutex_start_proxy_lock(struct rt_ + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task); ++ if (unlikely(ret)) ++ remove_waiter(lock, waiter); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +@@ -1814,7 +1840,8 @@ int rt_mutex_wait_proxy_lock(struct rt_m + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * +- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). ++ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or ++ * rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. 
Therefore we can diff --git a/queue-4.9/futex-handle-transient-ownerless-rtmutex-state-correctly.patch b/queue-4.9/futex-handle-transient-ownerless-rtmutex-state-correctly.patch new file mode 100644 index 00000000000..0bb854d1db3 --- /dev/null +++ b/queue-4.9/futex-handle-transient-ownerless-rtmutex-state-correctly.patch @@ -0,0 +1,85 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:43:15 +0200 +Subject: futex: Handle transient "ownerless" rtmutex state correctly +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Mike Galbraith + +commit 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 upstream. + +Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner(). +This is one possible chain of events leading to this: + +Task Prio Operation +T1 120 lock(F) +T2 120 lock(F) -> blocks (top waiter) +T3 50 (RT) lock(F) -> boosts T1 and blocks (new top waiter) +XX timeout/ -> wakes T2 + signal +T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set) +T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter + and the lower priority T2 cannot steal the lock. + -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON() + +The comment states that this is invalid and rt_mutex_real_owner() must +return a non NULL owner when the trylock failed, but in case of a queued +and woken up waiter rt_mutex_real_owner() == NULL is a valid transient +state. The higher priority waiter has simply not yet managed to take over +the rtmutex. + +The BUG_ON() is therefore wrong and this is just another retry condition in +fixup_pi_state_owner(). + +Drop the locks, so that T3 can make progress, and then try the fixup again. + +Gratian provided a great analysis, traces and a reproducer. The analysis is +to the point, but it confused the hell out of that tglx dude who had to +page in all the futex horrors again. Condensed version is above. + +[ tglx: Wrote comment and changelog ] + +Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") +Reported-by: Gratian Crisan +Signed-off-by: Mike Galbraith +Signed-off-by: Thomas Gleixner +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com +Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2497,10 +2497,22 @@ retry: + } + + /* +- * Since we just failed the trylock; there must be an owner. ++ * The trylock just failed, so either there is an owner or ++ * there is a higher priority waiter than this one. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); +- BUG_ON(!newowner); ++ /* ++ * If the higher priority waiter has not yet taken over the ++ * rtmutex then newowner is NULL. We can't return here with ++ * that state because it's inconsistent vs. the user space ++ * state. So drop the locks and try again. It's a valid ++ * situation and not any different from the other retry ++ * conditions. 
++ */ ++ if (unlikely(!newowner)) { ++ err = -EAGAIN; ++ goto handle_err; ++ } + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { diff --git a/queue-4.9/futex-prevent-robust-futex-exit-race.patch b/queue-4.9/futex-prevent-robust-futex-exit-race.patch new file mode 100644 index 00000000000..6f9d81d3b00 --- /dev/null +++ b/queue-4.9/futex-prevent-robust-futex-exit-race.patch @@ -0,0 +1,266 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:42:58 +0200 +Subject: futex: Prevent robust futex exit race +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Yang Tao + +commit ca16d5bee59807bf04deaab0a8eccecd5061528c upstream. + +Robust futexes utilize the robust_list mechanism to allow the kernel to +release futexes which are held when a task exits. The exit can be voluntary +or caused by a signal or fault. This prevents that waiters block forever. + +The futex operations in user space store a pointer to the futex they are +either locking or unlocking in the op_pending member of the per task robust +list. + +After a lock operation has succeeded the futex is queued in the robust list +linked list and the op_pending pointer is cleared. + +After an unlock operation has succeeded the futex is removed from the +robust list linked list and the op_pending pointer is cleared. + +The robust list exit code checks for the pending operation and any futex +which is queued in the linked list. It carefully checks whether the futex +value is the TID of the exiting task. If so, it sets the OWNER_DIED bit and +tries to wake up a potential waiter. + +This is race free for the lock operation but unlock has two race scenarios +where waiters might not be woken up. These issues can be observed with +regular robust pthread mutexes. PI aware pthread mutexes are not affected. + +(1) Unlocking task is killed after unlocking the futex value in user space + before being able to wake a waiter. + + pthread_mutex_unlock() + | + V + atomic_exchange_rel (&mutex->__data.__lock, 0) + <------------------------killed + lll_futex_wake () | + | + |(__lock = 0) + |(enter kernel) + | + V + do_exit() + exit_mm() + mm_release() + exit_robust_list() + handle_futex_death() + | + |(__lock = 0) + |(uval = 0) + | + V + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + The sanity check which ensures that the user space futex is owned by + the exiting task prevents the wakeup of waiters which in consequence + block infinitely. + +(2) Waiting task is killed after a wakeup and before it can acquire the + futex in user space. + + OWNER WAITER + futex_wait() + pthread_mutex_unlock() | + | | + |(__lock = 0) | + | | + V | + futex_wake() ------------> wakeup() + | + |(return to userspace) + |(__lock = 0) + | + V + oldval = mutex->__data.__lock + <-----------------killed + atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, | + id | assume_other_futex_waiters, 0) | + | + | + (enter kernel)| + | + V + do_exit() + | + | + V + handle_futex_death() + | + |(__lock = 0) + |(uval = 0) + | + V + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + The sanity check which ensures that the user space futex is owned + by the exiting task prevents the wakeup of waiters, which seems to + be correct as the exiting task does not own the futex value, but + the consequence is that other waiters wont be woken up and block + infinitely. 
+ +In both scenarios the following conditions are true: + + - task->robust_list->list_op_pending != NULL + - user space futex value == 0 + - Regular futex (not PI) + +If these conditions are met then it is reasonably safe to wake up a +potential waiter in order to prevent the above problems. + +As this might be a false positive it can cause spurious wakeups, but the +waiter side has to handle other types of unrelated wakeups, e.g. signals +gracefully anyway. So such a spurious wakeup will not affect the +correctness of these operations. + +This workaround must not touch the user space futex value and cannot set +the OWNER_DIED bit because the lock value is 0, i.e. uncontended. Setting +OWNER_DIED in this case would result in inconsistent state and subsequently +in malfunction of the owner died handling in user space. + +The rest of the user space state is still consistent as no other task can +observe the list_op_pending entry in the exiting tasks robust list. + +The eventually woken up waiter will observe the uncontended lock value and +take it over. + +[ tglx: Massaged changelog and comment. Made the return explicit and not + depend on the subsequent check and added constants to hand into + handle_futex_death() instead of plain numbers. Fixed a few coding + style issues. ] + +Fixes: 0771dfefc9e5 ("[PATCH] lightweight robust futexes: core") +Signed-off-by: Yang Tao +Signed-off-by: Yi Wang +Signed-off-by: Thomas Gleixner +Reviewed-by: Ingo Molnar +Acked-by: Peter Zijlstra (Intel) +Cc: stable@vger.kernel.org +Link: https://lkml.kernel.org/r/1573010582-35297-1-git-send-email-wang.yi59@zte.com.cn +Link: https://lkml.kernel.org/r/20191106224555.943191378@linutronix.de +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 51 insertions(+), 7 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -3526,11 +3526,16 @@ err_unlock: + return ret; + } + ++/* Constants for the pending_op argument of handle_futex_death */ ++#define HANDLE_DEATH_PENDING true ++#define HANDLE_DEATH_LIST false ++ + /* + * Process a futex-list entry, check whether it's owned by the + * dying task, and do notification if so: + */ +-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) ++static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, ++ bool pi, bool pending_op) + { + u32 uval, uninitialized_var(nval), mval; + int err; +@@ -3543,6 +3548,42 @@ retry: + if (get_user(uval, uaddr)) + return -1; + ++ /* ++ * Special case for regular (non PI) futexes. The unlock path in ++ * user space has two race scenarios: ++ * ++ * 1. The unlock path releases the user space futex value and ++ * before it can execute the futex() syscall to wake up ++ * waiters it is killed. ++ * ++ * 2. A woken up waiter is killed before it can acquire the ++ * futex in user space. ++ * ++ * In both cases the TID validation below prevents a wakeup of ++ * potential waiters which can cause these waiters to block ++ * forever. ++ * ++ * In both cases the following conditions are met: ++ * ++ * 1) task->robust_list->list_op_pending != NULL ++ * @pending_op == true ++ * 2) User space futex value == 0 ++ * 3) Regular futex: @pi == false ++ * ++ * If these conditions are met, it is safe to attempt waking up a ++ * potential waiter without touching the user space futex value and ++ * trying to set the OWNER_DIED bit. 
The user space futex value is ++ * uncontended and the rest of the user space mutex state is ++ * consistent, so a woken waiter will just take over the ++ * uncontended futex. Setting the OWNER_DIED bit would create ++ * inconsistent state and malfunction of the user space owner died ++ * handling. ++ */ ++ if (pending_op && !pi && !uval) { ++ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); ++ return 0; ++ } ++ + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + +@@ -3662,10 +3703,11 @@ static void exit_robust_list(struct task + * A pending lock might already be on the list, so + * don't process it twice: + */ +- if (entry != pending) ++ if (entry != pending) { + if (handle_futex_death((void __user *)entry + futex_offset, +- curr, pi)) ++ curr, pi, HANDLE_DEATH_LIST)) + return; ++ } + if (rc) + return; + entry = next_entry; +@@ -3679,9 +3721,10 @@ static void exit_robust_list(struct task + cond_resched(); + } + +- if (pending) ++ if (pending) { + handle_futex_death((void __user *)pending + futex_offset, +- curr, pip); ++ curr, pip, HANDLE_DEATH_PENDING); ++ } + } + + static void futex_cleanup(struct task_struct *tsk) +@@ -3964,7 +4007,8 @@ void compat_exit_robust_list(struct task + if (entry != pending) { + void __user *uaddr = futex_uaddr(entry, futex_offset); + +- if (handle_futex_death(uaddr, curr, pi)) ++ if (handle_futex_death(uaddr, curr, pi, ++ HANDLE_DEATH_LIST)) + return; + } + if (rc) +@@ -3983,7 +4027,7 @@ void compat_exit_robust_list(struct task + if (pending) { + void __user *uaddr = futex_uaddr(pending, futex_offset); + +- handle_futex_death(uaddr, curr, pip); ++ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); + } + } + diff --git a/queue-4.9/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch b/queue-4.9/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch new file mode 100644 index 00000000000..dc557f5a5ca --- /dev/null +++ b/queue-4.9/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch @@ -0,0 +1,273 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:41:42 +0200 +Subject: futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock() +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Peter Zijlstra + +commit cfafcd117da0216520568c195cb2f6cd1980c4bb upstream. + +By changing futex_lock_pi() to use rt_mutex_*_proxy_lock() all wait_list +modifications are done under both hb->lock and wait_lock. + +This closes the obvious interleave pattern between futex_lock_pi() and +futex_unlock_pi(), but not entirely so. 
See below: + +Before: + +futex_lock_pi() futex_unlock_pi() + unlock hb->lock + + lock hb->lock + unlock hb->lock + + lock rt_mutex->wait_lock + unlock rt_mutex_wait_lock + -EAGAIN + + lock rt_mutex->wait_lock + list_add + unlock rt_mutex->wait_lock + + schedule() + + lock rt_mutex->wait_lock + list_del + unlock rt_mutex->wait_lock + + + -EAGAIN + + lock hb->lock + +After: + +futex_lock_pi() futex_unlock_pi() + + lock hb->lock + lock rt_mutex->wait_lock + list_add + unlock rt_mutex->wait_lock + unlock hb->lock + + schedule() + lock hb->lock + unlock hb->lock + lock hb->lock + lock rt_mutex->wait_lock + list_del + unlock rt_mutex->wait_lock + + lock rt_mutex->wait_lock + unlock rt_mutex_wait_lock + -EAGAIN + + unlock hb->lock + +It does however solve the earlier starvation/live-lock scenario which got +introduced with the -EAGAIN since unlike the before scenario; where the +-EAGAIN happens while futex_unlock_pi() doesn't hold any locks; in the +after scenario it happens while futex_unlock_pi() actually holds a lock, +and then it is serialized on that lock. + +Signed-off-by: Peter Zijlstra (Intel) +Cc: juri.lelli@arm.com +Cc: bigeasy@linutronix.de +Cc: xlpang@redhat.com +Cc: rostedt@goodmis.org +Cc: mathieu.desnoyers@efficios.com +Cc: jdesfossez@efficios.com +Cc: dvhart@infradead.org +Cc: bristot@redhat.com +Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org +Signed-off-by: Thomas Gleixner +[bwh: Backported to 4.9: adjust context] +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/futex.c | 77 ++++++++++++++++++++++++++++------------ + kernel/locking/rtmutex.c | 26 +++---------- + kernel/locking/rtmutex_common.h | 1 + 3 files changed, 62 insertions(+), 42 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2333,20 +2333,7 @@ queue_unlock(struct futex_hash_bucket *h + hb_waiters_dec(hb); + } + +-/** +- * queue_me() - Enqueue the futex_q on the futex_hash_bucket +- * @q: The futex_q to enqueue +- * @hb: The destination hash bucket +- * +- * The hb->lock must be held by the caller, and is released here. A call to +- * queue_me() is typically paired with exactly one call to unqueue_me(). The +- * exceptions involve the PI related operations, which may use unqueue_me_pi() +- * or nothing if the unqueue is done as part of the wake process and the unqueue +- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for +- * an example). +- */ +-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +- __releases(&hb->lock) ++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + { + int prio; + +@@ -2363,6 +2350,24 @@ static inline void queue_me(struct futex + plist_node_init(&q->list, prio); + plist_add(&q->list, &hb->chain); + q->task = current; ++} ++ ++/** ++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket ++ * @q: The futex_q to enqueue ++ * @hb: The destination hash bucket ++ * ++ * The hb->lock must be held by the caller, and is released here. A call to ++ * queue_me() is typically paired with exactly one call to unqueue_me(). The ++ * exceptions involve the PI related operations, which may use unqueue_me_pi() ++ * or nothing if the unqueue is done as part of the wake process and the unqueue ++ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for ++ * an example). 
++ */ ++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) ++ __releases(&hb->lock) ++{ ++ __queue_me(q, hb); + spin_unlock(&hb->lock); + } + +@@ -2868,6 +2873,7 @@ static int futex_lock_pi(u32 __user *uad + { + struct hrtimer_sleeper timeout, *to = NULL; + struct task_struct *exiting = NULL; ++ struct rt_mutex_waiter rt_waiter; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; + int res, ret; +@@ -2928,25 +2934,52 @@ retry_private: + } + } + ++ WARN_ON(!q.pi_state); ++ + /* + * Only actually queue now that the atomic ops are done: + */ +- queue_me(&q, hb); ++ __queue_me(&q, hb); + +- WARN_ON(!q.pi_state); +- /* +- * Block on the PI mutex: +- */ +- if (!trylock) { +- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); +- } else { ++ if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; ++ goto no_block; + } + ++ /* ++ * We must add ourselves to the rt_mutex waitlist while holding hb->lock ++ * such that the hb and rt_mutex wait lists match. ++ */ ++ rt_mutex_init_waiter(&rt_waiter); ++ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); ++ if (ret) { ++ if (ret == 1) ++ ret = 0; ++ ++ goto no_block; ++ } ++ ++ spin_unlock(q.lock_ptr); ++ ++ if (unlikely(to)) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); ++ + spin_lock(q.lock_ptr); + /* ++ * If we failed to acquire the lock (signal/timeout), we must ++ * first acquire the hb->lock before removing the lock from the ++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex ++ * wait lists consistent. ++ */ ++ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) ++ ret = 0; ++ ++no_block: ++ /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1523,19 +1523,6 @@ int __sched rt_mutex_lock_interruptible( + EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + + /* +- * Futex variant with full deadlock detection. +- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock(). +- */ +-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock, +- struct hrtimer_sleeper *timeout) +-{ +- might_sleep(); +- +- return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, +- timeout, RT_MUTEX_FULL_CHAINWALK); +-} +- +-/* + * Futex variant, must not use fastpath. + */ + int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) +@@ -1808,12 +1795,6 @@ int rt_mutex_wait_proxy_lock(struct rt_m + /* sleep on the mutex */ + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + +- /* +- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might +- * have to fix that up. +- */ +- fixup_rt_mutex_waiters(lock); +- + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +@@ -1853,6 +1834,13 @@ bool rt_mutex_cleanup_proxy_lock(struct + fixup_rt_mutex_waiters(lock); + cleanup = true; + } ++ ++ /* ++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might ++ * have to fix that up. 
++ */ ++ fixup_rt_mutex_waiters(lock); ++ + raw_spin_unlock_irq(&lock->wait_lock); + + return cleanup; +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -112,7 +112,6 @@ extern int rt_mutex_wait_proxy_lock(stru + struct rt_mutex_waiter *waiter); + extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); +-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); + extern int rt_mutex_futex_trylock(struct rt_mutex *l); + extern int __rt_mutex_futex_trylock(struct rt_mutex *l); + diff --git a/queue-4.9/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch b/queue-4.9/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch new file mode 100644 index 00000000000..beac03aea08 --- /dev/null +++ b/queue-4.9/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch @@ -0,0 +1,135 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:42:08 +0200 +Subject: futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock() +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Peter Zijlstra + +commit 04dc1b2fff4e96cb4142227fbdc63c8871ad4ed9 upstream. + +Markus reported that the glibc/nptl/tst-robustpi8 test was failing after +commit: + + cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") + +The following trace shows the problem: + + ld-linux-x86-64-2161 [019] .... 410.760971: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_LOCK_PI + ld-linux-x86-64-2161 [019] ...1 410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0 + ld-linux-x86-64-2165 [011] .... 410.760978: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_UNLOCK_PI + ld-linux-x86-64-2165 [011] d..1 410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0 + ld-linux-x86-64-2165 [011] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000 + ld-linux-x86-64-2161 [019] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT + +Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161 +which then returns with -ETIMEDOUT. That wrecks the lock state, because now +the owner isn't aware it acquired the lock and removes the pending robust +list entry. + +If 2161 is killed, the robust list will not clear out this futex and the +subsequent acquire on this futex will then (correctly) result in -ESRCH +which is unexpected by glibc, triggers an internal assertion and dies. + +Task 2161 Task 2165 + +rt_mutex_wait_proxy_lock() + timeout(); + /* T2161 is still queued in the waiter list */ + return -ETIMEDOUT; + + futex_unlock_pi() + spin_lock(hb->lock); + rtmutex_unlock() + remove_rtmutex_waiter(T2161); + mark_lock_available(); + /* Make the next waiter owner of the user space side */ + futex_uval = 2161; + spin_unlock(hb->lock); +spin_lock(hb->lock); +rt_mutex_cleanup_proxy_lock() + if (rtmutex_owner() !== current) + ... + return FAIL; +.... +return -ETIMEOUT; + +This means that rt_mutex_cleanup_proxy_lock() needs to call +try_to_take_rt_mutex() so it can take over the rtmutex correctly which was +assigned by the waker. If the rtmutex is owned by some other task then this +call is harmless and just confirmes that the waiter is not able to acquire +it. 
+ +While there, fix what looks like a merge error which resulted in +rt_mutex_cleanup_proxy_lock() having two calls to +fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any. +Both should have one, since both potentially touch the waiter list. + +Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()") +Reported-by: Markus Trippelsdorf +Bug-Spotted-by: Thomas Gleixner +Signed-off-by: Peter Zijlstra (Intel) +Cc: Florian Weimer +Cc: Darren Hart +Cc: Sebastian Andrzej Siewior +Cc: Markus Trippelsdorf +Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net +Signed-off-by: Thomas Gleixner +Signed-off-by: Ben Hutchings +Signed-off-by: Greg Kroah-Hartman +--- + kernel/locking/rtmutex.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1796,12 +1796,14 @@ int rt_mutex_wait_proxy_lock(struct rt_m + int ret; + + raw_spin_lock_irq(&lock->wait_lock); +- +- set_current_state(TASK_INTERRUPTIBLE); +- + /* sleep on the mutex */ ++ set_current_state(TASK_INTERRUPTIBLE); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); +- ++ /* ++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might ++ * have to fix that up. ++ */ ++ fixup_rt_mutex_waiters(lock); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +@@ -1833,15 +1835,25 @@ bool rt_mutex_cleanup_proxy_lock(struct + + raw_spin_lock_irq(&lock->wait_lock); + /* ++ * Do an unconditional try-lock, this deals with the lock stealing ++ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() ++ * sets a NULL owner. ++ * ++ * We're not interested in the return value, because the subsequent ++ * test on rt_mutex_owner() will infer that. If the trylock succeeded, ++ * we will own the lock and it will have removed the waiter. If we ++ * failed the trylock, we're still not owner and we need to remove ++ * ourselves. ++ */ ++ try_to_take_rt_mutex(lock, current, waiter); ++ /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { + remove_waiter(lock, waiter); +- fixup_rt_mutex_waiters(lock); + cleanup = true; + } +- + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. diff --git a/queue-4.9/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch b/queue-4.9/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch new file mode 100644 index 00000000000..427c051c9b6 --- /dev/null +++ b/queue-4.9/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch @@ -0,0 +1,88 @@ +From foo@baz Mon Mar 29 07:48:09 AM CEST 2021 +From: Ben Hutchings +Date: Sun, 28 Mar 2021 22:41:33 +0200 +Subject: futex,rt_mutex: Introduce rt_mutex_init_waiter() +To: stable@vger.kernel.org +Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli +Message-ID: +Content-Disposition: inline + +From: Peter Zijlstra + +commit 50809358dd7199aa7ce232f6877dd09ec30ef374 upstream. + +Since there's already two copies of this code, introduce a helper now +before adding a third one. 
+
+Signed-off-by: Peter Zijlstra (Intel) 
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org
+Signed-off-by: Thomas Gleixner 
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings 
+Signed-off-by: Greg Kroah-Hartman 
+---
+ kernel/futex.c                  |    5 +----
+ kernel/locking/rtmutex.c        |   12 +++++++++---
+ kernel/locking/rtmutex_common.h |    1 +
+ 3 files changed, 11 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3234,10 +3234,7 @@ static int futex_wait_requeue_pi(u32 __u
+ 	 * The waiter is allocated on our stack, manipulated by the requeue
+ 	 * code while we sleep on uaddr.
+ 	 */
+-	debug_rt_mutex_init_waiter(&rt_waiter);
+-	RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+-	RB_CLEAR_NODE(&rt_waiter.tree_entry);
+-	rt_waiter.task = NULL;
++	rt_mutex_init_waiter(&rt_waiter);
+ 
+ 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ 	if (unlikely(ret != 0))
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1176,6 +1176,14 @@ void rt_mutex_adjust_pi(struct task_stru
+ 			next_lock, NULL, task);
+ }
+ 
++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
++{
++	debug_rt_mutex_init_waiter(waiter);
++	RB_CLEAR_NODE(&waiter->pi_tree_entry);
++	RB_CLEAR_NODE(&waiter->tree_entry);
++	waiter->task = NULL;
++}
++
+ /**
+  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+  * @lock:		 the rt_mutex to take
+@@ -1258,9 +1266,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+ 	unsigned long flags;
+ 	int ret = 0;
+ 
+-	debug_rt_mutex_init_waiter(&waiter);
+-	RB_CLEAR_NODE(&waiter.pi_tree_entry);
+-	RB_CLEAR_NODE(&waiter.tree_entry);
++	rt_mutex_init_waiter(&waiter);
+ 
+ 	/*
+ 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -103,6 +103,7 @@ extern struct task_struct *rt_mutex_next
+ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ 				       struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ 				     struct rt_mutex_waiter *waiter,
+ 				     struct task_struct *task);
diff --git a/queue-4.9/futex-use-smp_store_release-in-mark_wake_futex.patch b/queue-4.9/futex-use-smp_store_release-in-mark_wake_futex.patch
new file mode 100644
index 00000000000..246f3aa270b
--- /dev/null
+++ b/queue-4.9/futex-use-smp_store_release-in-mark_wake_futex.patch
@@ -0,0 +1,46 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings 
+Date: Sun, 28 Mar 2021 22:40:54 +0200
+Subject: futex: Use smp_store_release() in mark_wake_futex()
+To: stable@vger.kernel.org
+Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli 
+Message-ID: 
+Content-Disposition: inline
+
+From: Peter Zijlstra 
+
+commit 1b367ece0d7e696cab1c8501bab282cc6a538b3f upstream.
+
+Since the futex_q can disappear the instruction after assigning NULL,
+this really should be a RELEASE barrier. That stops loads from hitting
+dead memory too.
+
+Signed-off-by: Peter Zijlstra (Intel)
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.604296452@infradead.org
+Signed-off-by: Thomas Gleixner
+Signed-off-by: Ben Hutchings
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/futex.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1565,8 +1565,7 @@ static void mark_wake_futex(struct wake_
+ 	 * memory barrier is required here to prevent the following
+ 	 * store to lock_ptr from getting ahead of the plist_del.
+ 	 */
+-	smp_wmb();
+-	q->lock_ptr = NULL;
++	smp_store_release(&q->lock_ptr, NULL);
+ }
+ 
+ /*
diff --git a/queue-4.9/locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch b/queue-4.9/locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch
new file mode 100644
index 00000000000..dd2c3e074cc
--- /dev/null
+++ b/queue-4.9/locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch
@@ -0,0 +1,337 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings
+Date: Sun, 28 Mar 2021 22:42:44 +0200
+Subject: locking/futex: Allow low-level atomic operations to return -EAGAIN
+To: stable@vger.kernel.org
+Cc: Lee Jones , "Luis Claudio R. Goncalves" , Florian Fainelli
+Message-ID:
+Content-Disposition: inline
+
+From: Will Deacon
+
+commit 6b4f4bc9cb22875f97023984a625386f0c7cc1c0 upstream.
+
+Some futex() operations, including FUTEX_WAKE_OP, require the kernel to
+perform an atomic read-modify-write of the futex word via the userspace
+mapping. These operations are implemented by each architecture in
+arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
+are called in atomic context with the relevant hash bucket locks held.
+
+Although these routines may return -EFAULT in response to a page fault
+generated when accessing userspace, they are expected to succeed (i.e.
+return 0) in all other cases. This poses a problem for architectures
+that do not provide bounded forward progress guarantees or fairness of
+contended atomic operations and can lead to starvation in some cases.
+
+In these problematic scenarios, we must return back to the core futex
+code so that we can drop the hash bucket locks and reschedule if
+necessary, much like we do in the case of a page fault.
+
+Allow architectures to return -EAGAIN from their implementations of
+arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
+will cause the core futex code to reschedule if necessary and return
+back to the architecture code later on.
+
+Cc:
+Acked-by: Peter Zijlstra (Intel)
+Signed-off-by: Will Deacon
+Signed-off-by: Greg Kroah-Hartman
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/futex.c |  185 +++++++++++++++++++++++++++++++++++----------------------
+ 1 file changed, 115 insertions(+), 70 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1407,13 +1407,15 @@ static int lookup_pi_state(u32 __user *u
+ 
+ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+ {
++	int err;
+ 	u32 uninitialized_var(curval);
+ 
+ 	if (unlikely(should_fail_futex(true)))
+ 		return -EFAULT;
+ 
+-	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+-		return -EFAULT;
++	err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++	if (unlikely(err))
++		return err;
+ 
+ 	/* If user space value changed, let the caller retry */
+ 	return curval != uval ? -EAGAIN : 0;
+@@ -1606,10 +1608,8 @@ static int wake_futex_pi(u32 __user *uad
+ 	if (unlikely(should_fail_futex(true)))
+ 		ret = -EFAULT;
+ 
+-	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
+-		ret = -EFAULT;
+-
+-	} else if (curval != uval) {
++	ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++	if (!ret && (curval != uval)) {
+ 		/*
+ 		 * If a unconditional UNLOCK_PI operation (user space did not
+ 		 * try the TID->0 transition) raced with a waiter setting the
+@@ -1795,32 +1795,32 @@ retry_private:
+ 	double_lock_hb(hb1, hb2);
+ 	op_ret = futex_atomic_op_inuser(op, uaddr2);
+ 	if (unlikely(op_ret < 0)) {
+-
+ 		double_unlock_hb(hb1, hb2);
+ 
+-#ifndef CONFIG_MMU
+-		/*
+-		 * we don't get EFAULT from MMU faults if we don't have an MMU,
+-		 * but we might get them from range checking
+-		 */
+-		ret = op_ret;
+-		goto out_put_keys;
+-#endif
+-
+-		if (unlikely(op_ret != -EFAULT)) {
++		if (!IS_ENABLED(CONFIG_MMU) ||
++		    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
++			/*
++			 * we don't get EFAULT from MMU faults if we don't have
++			 * an MMU, but we might get them from range checking
++			 */
+ 			ret = op_ret;
+ 			goto out_put_keys;
+ 		}
+ 
+-		ret = fault_in_user_writeable(uaddr2);
+-		if (ret)
+-			goto out_put_keys;
++		if (op_ret == -EFAULT) {
++			ret = fault_in_user_writeable(uaddr2);
++			if (ret)
++				goto out_put_keys;
++		}
+ 
+-		if (!(flags & FLAGS_SHARED))
++		if (!(flags & FLAGS_SHARED)) {
++			cond_resched();
+ 			goto retry_private;
++		}
+ 
+ 		put_futex_key(&key2);
+ 		put_futex_key(&key1);
++		cond_resched();
+ 		goto retry;
+ 	}
+ 
+@@ -2516,14 +2516,17 @@ retry:
+ 	if (!pi_state->owner)
+ 		newtid |= FUTEX_OWNER_DIED;
+ 
+-	if (get_futex_value_locked(&uval, uaddr))
+-		goto handle_fault;
++	err = get_futex_value_locked(&uval, uaddr);
++	if (err)
++		goto handle_err;
+ 
+ 	for (;;) {
+ 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ 
+-		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+-			goto handle_fault;
++		err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++		if (err)
++			goto handle_err;
++
+ 		if (curval == uval)
+ 			break;
+ 		uval = curval;
+@@ -2538,23 +2541,36 @@ retry:
+ 		return argowner == current;
+ 
+ 	/*
+-	 * To handle the page fault we need to drop the locks here. That gives
+-	 * the other task (either the highest priority waiter itself or the
+-	 * task which stole the rtmutex) the chance to try the fixup of the
+-	 * pi_state. So once we are back from handling the fault we need to
+-	 * check the pi_state after reacquiring the locks and before trying to
+-	 * do another fixup. When the fixup has been done already we simply
+-	 * return.
++	 * In order to reschedule or handle a page fault, we need to drop the
++	 * locks here. In the case of a fault, this gives the other task
++	 * (either the highest priority waiter itself or the task which stole
++	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
++	 * are back from handling the fault we need to check the pi_state after
++	 * reacquiring the locks and before trying to do another fixup. When
++	 * the fixup has been done already we simply return.
+ 	 *
+ 	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ 	 * drop hb->lock since the caller owns the hb -> futex_q relation.
+ 	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ 	 */
+-handle_fault:
++handle_err:
+ 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ 	spin_unlock(q->lock_ptr);
+ 
+-	err = fault_in_user_writeable(uaddr);
++	switch (err) {
++	case -EFAULT:
++		err = fault_in_user_writeable(uaddr);
++		break;
++
++	case -EAGAIN:
++		cond_resched();
++		err = 0;
++		break;
++
++	default:
++		WARN_ON_ONCE(1);
++		break;
++	}
+ 
+ 	spin_lock(q->lock_ptr);
+ 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+@@ -3128,10 +3144,8 @@ retry:
+ 		 * A unconditional UNLOCK_PI op raced against a waiter
+ 		 * setting the FUTEX_WAITERS bit. Try again.
+ 		 */
+-		if (ret == -EAGAIN) {
+-			put_futex_key(&key);
+-			goto retry;
+-		}
++		if (ret == -EAGAIN)
++			goto pi_retry;
+ 		/*
+ 		 * wake_futex_pi has detected invalid state. Tell user
+ 		 * space.
+@@ -3146,9 +3160,19 @@ retry:
+ 	 * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ 	 * owner.
+ 	 */
+-	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
++	if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
+ 		spin_unlock(&hb->lock);
+-		goto pi_faulted;
++		switch (ret) {
++		case -EFAULT:
++			goto pi_faulted;
++
++		case -EAGAIN:
++			goto pi_retry;
++
++		default:
++			WARN_ON_ONCE(1);
++			goto out_putkey;
++		}
+ 	}
+ 
+ 	/*
+@@ -3162,6 +3186,11 @@ out_putkey:
+ 	put_futex_key(&key);
+ 	return ret;
+ 
++pi_retry:
++	put_futex_key(&key);
++	cond_resched();
++	goto retry;
++
+ pi_faulted:
+ 	put_futex_key(&key);
+ 
+@@ -3504,6 +3533,7 @@ err_unlock:
+ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+ {
+ 	u32 uval, uninitialized_var(nval), mval;
++	int err;
+ 
+ 	/* Futex address must be 32bit aligned */
+ 	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+@@ -3513,42 +3543,57 @@ retry:
+ 	if (get_user(uval, uaddr))
+ 		return -1;
+ 
+-	if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
+-		/*
+-		 * Ok, this dying thread is truly holding a futex
+-		 * of interest. Set the OWNER_DIED bit atomically
+-		 * via cmpxchg, and if the value had FUTEX_WAITERS
+-		 * set, wake up a waiter (if any). (We have to do a
+-		 * futex_wake() even if OWNER_DIED is already set -
+-		 * to handle the rare but possible case of recursive
+-		 * thread-death.) The rest of the cleanup is done in
+-		 * userspace.
+-		 */
+-		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+-		/*
+-		 * We are not holding a lock here, but we want to have
+-		 * the pagefault_disable/enable() protection because
+-		 * we want to handle the fault gracefully. If the
+-		 * access fails we try to fault in the futex with R/W
+-		 * verification via get_user_pages. get_user() above
+-		 * does not guarantee R/W access. If that fails we
+-		 * give up and leave the futex locked.
+-		 */
+-		if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
++	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
++		return 0;
++
++	/*
++	 * Ok, this dying thread is truly holding a futex
++	 * of interest. Set the OWNER_DIED bit atomically
++	 * via cmpxchg, and if the value had FUTEX_WAITERS
++	 * set, wake up a waiter (if any). (We have to do a
++	 * futex_wake() even if OWNER_DIED is already set -
++	 * to handle the rare but possible case of recursive
++	 * thread-death.) The rest of the cleanup is done in
++	 * userspace.
++	 */
++	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
++
++	/*
++	 * We are not holding a lock here, but we want to have
++	 * the pagefault_disable/enable() protection because
++	 * we want to handle the fault gracefully. If the
++	 * access fails we try to fault in the futex with R/W
++	 * verification via get_user_pages. get_user() above
++	 * does not guarantee R/W access. If that fails we
++	 * give up and leave the futex locked.
++	 */
++	if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
++		switch (err) {
++		case -EFAULT:
+ 			if (fault_in_user_writeable(uaddr))
+ 				return -1;
+ 			goto retry;
+-		}
+-		if (nval != uval)
++
++		case -EAGAIN:
++			cond_resched();
+ 			goto retry;
+ 
+-		/*
+-		 * Wake robust non-PI futexes here. The wakeup of
+-		 * PI futexes happens in exit_pi_state():
+-		 */
+-		if (!pi && (uval & FUTEX_WAITERS))
+-			futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++		default:
++			WARN_ON_ONCE(1);
++			return err;
++		}
+ 	}
++
++	if (nval != uval)
++		goto retry;
++
++	/*
++	 * Wake robust non-PI futexes here. The wakeup of
++	 * PI futexes happens in exit_pi_state():
++	 */
++	if (!pi && (uval & FUTEX_WAITERS))
++		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++
+ 	return 0;
+ }
+ 
diff --git a/queue-4.9/series b/queue-4.9/series
index 82aa6ab8dae..240edd61d6e 100644
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -34,3 +34,16 @@ acpi-scan-rearrange-memory-allocation-in-acpi_device.patch
 acpi-scan-use-unique-number-for-instance_no.patch
 perf-auxtrace-fix-auxtrace-queue-conflict.patch
 idr-add-ida_is_empty.patch
+futex-use-smp_store_release-in-mark_wake_futex.patch
+futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
+futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
+futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
+futex-avoid-freeing-an-active-timer.patch
+futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
+futex-handle-early-deadlock-return-correctly.patch
+futex-fix-possible-missed-wakeup.patch
+locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch
+arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch
+futex-prevent-robust-futex-exit-race.patch
+futex-fix-incorrect-should_fail_futex-handling.patch
+futex-handle-transient-ownerless-rtmutex-state-correctly.patch
-- 
2.47.3