--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:51 +0200
+Subject: arm64: futex: Bound number of LDXR/STXR loops in FUTEX_WAKE_OP
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqS7bKz9n1Jto3@decadent.org.uk>
+Content-Disposition: inline
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 03110a5cb2161690ae5ac04994d47ed0cd6cef75 upstream.
+
+Our futex implementation makes use of LDXR/STXR loops to perform atomic
+updates to user memory from atomic context. This can lead to latency
+problems if we end up spinning around the LL/SC sequence at the expense
+of doing something useful.
+
+Rework our futex atomic operations so that we return -EAGAIN if we fail
+to update the futex word after 128 attempts. The core futex code will
+reschedule if necessary and we'll try again later.
+
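+As an illustration only (this is not the arm64 assembly in the diff below,
+and the helper name futex_add_bounded is made up for the sketch), the same
+bounding idea looks roughly like this in portable C11:
+
+  #include <errno.h>
+  #include <stdatomic.h>
+
+  #define FUTEX_MAX_LOOPS 128
+
+  /* Retry a CAS-based add at most FUTEX_MAX_LOOPS times, then give up
+   * with -EAGAIN so the caller can reschedule and try again later. */
+  static int futex_add_bounded(_Atomic unsigned int *uaddr,
+                               unsigned int oparg, unsigned int *oldval)
+  {
+          unsigned int loops = FUTEX_MAX_LOOPS;
+          unsigned int old = atomic_load_explicit(uaddr, memory_order_relaxed);
+
+          while (loops--) {
+                  if (atomic_compare_exchange_weak(uaddr, &old, old + oparg)) {
+                          *oldval = old;
+                          return 0;       /* update succeeded */
+                  }
+                  /* a failed CAS refreshed 'old'; try again */
+          }
+          return -EAGAIN;                 /* heavily contended */
+  }
+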
+Fixes: 6170a97460db ("arm64: Atomic operations")
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/futex.h | 59 +++++++++++++++++++++++++----------------
+ 1 file changed, 37 insertions(+), 22 deletions(-)
+
+--- a/arch/arm64/include/asm/futex.h
++++ b/arch/arm64/include/asm/futex.h
+@@ -26,7 +26,12 @@
+ #include <asm/errno.h>
+ #include <asm/sysreg.h>
+
++#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */
++
+ #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \
++do { \
++ unsigned int loops = FUTEX_MAX_LOOPS; \
++ \
+ asm volatile( \
+ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \
+ CONFIG_ARM64_PAN) \
+@@ -34,21 +39,26 @@
+ "1: ldxr %w1, %2\n" \
+ insn "\n" \
+ "2: stlxr %w0, %w3, %2\n" \
+-" cbnz %w0, 1b\n" \
+-" dmb ish\n" \
++" cbz %w0, 3f\n" \
++" sub %w4, %w4, %w0\n" \
++" cbnz %w4, 1b\n" \
++" mov %w0, %w7\n" \
+ "3:\n" \
++" dmb ish\n" \
+ " .pushsection .fixup,\"ax\"\n" \
+ " .align 2\n" \
+-"4: mov %w0, %w5\n" \
++"4: mov %w0, %w6\n" \
+ " b 3b\n" \
+ " .popsection\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \
+ CONFIG_ARM64_PAN) \
+- : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \
+- : "r" (oparg), "Ir" (-EFAULT) \
+- : "memory")
++ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \
++ "+r" (loops) \
++ : "r" (oparg), "Ir" (-EFAULT), "Ir" (-EAGAIN) \
++ : "memory"); \
++} while (0)
+
+ static inline int
+ arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
+@@ -59,23 +69,23 @@ arch_futex_atomic_op_inuser(int op, int
+
+ switch (op) {
+ case FUTEX_OP_SET:
+- __futex_atomic_op("mov %w3, %w4",
++ __futex_atomic_op("mov %w3, %w5",
+ ret, oldval, uaddr, tmp, oparg);
+ break;
+ case FUTEX_OP_ADD:
+- __futex_atomic_op("add %w3, %w1, %w4",
++ __futex_atomic_op("add %w3, %w1, %w5",
+ ret, oldval, uaddr, tmp, oparg);
+ break;
+ case FUTEX_OP_OR:
+- __futex_atomic_op("orr %w3, %w1, %w4",
++ __futex_atomic_op("orr %w3, %w1, %w5",
+ ret, oldval, uaddr, tmp, oparg);
+ break;
+ case FUTEX_OP_ANDN:
+- __futex_atomic_op("and %w3, %w1, %w4",
++ __futex_atomic_op("and %w3, %w1, %w5",
+ ret, oldval, uaddr, tmp, ~oparg);
+ break;
+ case FUTEX_OP_XOR:
+- __futex_atomic_op("eor %w3, %w1, %w4",
++ __futex_atomic_op("eor %w3, %w1, %w5",
+ ret, oldval, uaddr, tmp, oparg);
+ break;
+ default:
+@@ -95,6 +105,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ u32 oldval, u32 newval)
+ {
+ int ret = 0;
++ unsigned int loops = FUTEX_MAX_LOOPS;
+ u32 val, tmp;
+ u32 __user *uaddr;
+
+@@ -106,21 +117,25 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+ " prfm pstl1strm, %2\n"
+ "1: ldxr %w1, %2\n"
+-" sub %w3, %w1, %w4\n"
+-" cbnz %w3, 3f\n"
+-"2: stlxr %w3, %w5, %2\n"
+-" cbnz %w3, 1b\n"
+-" dmb ish\n"
++" sub %w3, %w1, %w5\n"
++" cbnz %w3, 4f\n"
++"2: stlxr %w3, %w6, %2\n"
++" cbz %w3, 3f\n"
++" sub %w4, %w4, %w3\n"
++" cbnz %w4, 1b\n"
++" mov %w0, %w8\n"
+ "3:\n"
++" dmb ish\n"
++"4:\n"
+ " .pushsection .fixup,\"ax\"\n"
+-"4: mov %w0, %w6\n"
+-" b 3b\n"
++"5: mov %w0, %w7\n"
++" b 4b\n"
+ " .popsection\n"
+- _ASM_EXTABLE(1b, 4b)
+- _ASM_EXTABLE(2b, 4b)
++ _ASM_EXTABLE(1b, 5b)
++ _ASM_EXTABLE(2b, 5b)
+ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+- : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
+- : "r" (oldval), "r" (newval), "Ir" (-EFAULT)
++ : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
++ : "r" (oldval), "r" (newval), "Ir" (-EFAULT), "Ir" (-EAGAIN)
+ : "memory");
+
+ *uval = val;
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:00 +0200
+Subject: futex: Avoid freeing an active timer
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqGB/tmAa2cjOz@decadent.org.uk>
+Content-Disposition: inline
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 97181f9bd57405b879403763284537e27d46963d upstream.
+
+Alexander reported a hrtimer debug_object splat:
+
+ ODEBUG: free active (active state 0) object type: hrtimer hint: hrtimer_wakeup (kernel/time/hrtimer.c:1423)
+
+ debug_object_free (lib/debugobjects.c:603)
+ destroy_hrtimer_on_stack (kernel/time/hrtimer.c:427)
+ futex_lock_pi (kernel/futex.c:2740)
+ do_futex (kernel/futex.c:3399)
+ SyS_futex (kernel/futex.c:3447 kernel/futex.c:3415)
+ do_syscall_64 (arch/x86/entry/common.c:284)
+ entry_SYSCALL64_slow_path (arch/x86/entry/entry_64.S:249)
+
+Which was caused by commit:
+
+ cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+
+... losing the hrtimer_cancel() in the shuffle. Where previously the
+hrtimer_cancel() was done by rt_mutex_slowlock() we now need to do it
+manually.
+
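+A minimal sketch of the resulting exit path, with hypothetical stand-in
+helpers for the hrtimer calls (see the diff below for the real thing):
+
+  struct sleeper;
+  static void cancel_timer(struct sleeper *to)  { (void)to; /* hrtimer_cancel() */ }
+  static void destroy_timer(struct sleeper *to) { (void)to; /* destroy_hrtimer_on_stack() */ }
+
+  static void lock_pi_exit(struct sleeper *to)
+  {
+          if (to) {
+                  cancel_timer(to);   /* stop it while it may still be armed */
+                  destroy_timer(to);  /* only then release the debug object  */
+          }
+  }
+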
+Reported-by: Alexander Levin <alexander.levin@verizon.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Fixes: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704101802370.2906@nanos
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3018,8 +3018,10 @@ out_unlock_put_key:
+ out_put_key:
+ put_futex_key(&q.key);
+ out:
+- if (to)
++ if (to) {
++ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
++ }
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
+
+ uaddr_faulted:
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:41:51 +0200
+Subject: futex: Drop hb->lock before enqueueing on the rtmutex
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqD/h/x3P8HwyB@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 56222b212e8edb1cf51f5dd73ff645809b082b40 upstream.
+
+When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution the PI
+chain code will (falsely) report a deadlock and BUG.
+
+The problem is that it holds hb->lock (now an rt_mutex) while doing
+task_blocks_on_rt_mutex() on the futex's pi_state::rtmutex. When interleaved
+just right with futex_unlock_pi(), this leads it to believe it sees an
+AB-BA deadlock.
+
+ Task1 (holds rt_mutex, Task2 (does FUTEX_LOCK_PI)
+ does FUTEX_UNLOCK_PI)
+
+ lock hb->lock
+ lock rt_mutex (as per start_proxy)
+ lock hb->lock
+
+Which is a trivial AB-BA.
+
+It is not an actual deadlock, because it won't be holding hb->lock by the
+time it actually blocks on the rt_mutex, but the chainwalk code doesn't
+know that and it would be a nightmare to handle this gracefully.
+
+To avoid this problem, do the same as in futex_unlock_pi() and drop
+hb->lock after acquiring wait_lock. This still fully serializes against
+futex_unlock_pi(), since adding to the wait_list does the very same lock
+dance, and removing it holds both locks.
+
+Aside from solving the RT problem, this makes the lock and unlock mechanisms
+symmetric and reduces the hb->lock hold time.
+
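+The resulting ordering, sketched with plain pthread mutexes standing in for
+hb->lock and rt_mutex::wait_lock (an illustration of the lock dance only,
+not the kernel code in the diff below):
+
+  #include <pthread.h>
+
+  static pthread_mutex_t hb_lock   = PTHREAD_MUTEX_INITIALIZER;
+  static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
+
+  static void lock_pi_enqueue_sketch(void)
+  {
+          pthread_mutex_lock(&hb_lock);
+          /* ... queue the futex_q and set up pi_state under hb_lock ... */
+
+          pthread_mutex_lock(&wait_lock);   /* take wait_lock first,        */
+          pthread_mutex_unlock(&hb_lock);   /* then drop hb->lock, so we    */
+                                            /* never block while holding it */
+          /* ... __rt_mutex_start_proxy_lock(): enqueue the rt_waiter ... */
+          pthread_mutex_unlock(&wait_lock);
+  }
+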
+Reported-and-tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Suggested-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.161341537@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 30 +++++++++++++++++-------
+ kernel/locking/rtmutex.c | 49 ++++++++++++++++++++++------------------
+ kernel/locking/rtmutex_common.h | 3 ++
+ 3 files changed, 52 insertions(+), 30 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2948,20 +2948,33 @@ retry_private:
+ goto no_block;
+ }
+
++ rt_mutex_init_waiter(&rt_waiter);
++
+ /*
+- * We must add ourselves to the rt_mutex waitlist while holding hb->lock
+- * such that the hb and rt_mutex wait lists match.
++ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
++ * hold it while doing rt_mutex_start_proxy(), because then it will
++ * include hb->lock in the blocking chain, even through we'll not in
++ * fact hold it while blocking. This will lead it to report -EDEADLK
++ * and BUG when futex_unlock_pi() interleaves with this.
++ *
++ * Therefore acquire wait_lock while holding hb->lock, but drop the
++ * latter before calling rt_mutex_start_proxy_lock(). This still fully
++ * serializes against futex_unlock_pi() as that does the exact same
++ * lock handoff sequence.
+ */
+- rt_mutex_init_waiter(&rt_waiter);
+- ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
++ spin_unlock(q.lock_ptr);
++ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
++
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
++ spin_lock(q.lock_ptr);
+ goto no_block;
+ }
+
+- spin_unlock(q.lock_ptr);
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+@@ -2974,6 +2987,9 @@ retry_private:
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
++ *
++ * In particular; it is important that futex_unlock_pi() can not
++ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+@@ -3071,10 +3087,6 @@ retry:
+
+ get_pi_state(pi_state);
+ /*
+- * Since modifying the wait_list is done while holding both
+- * hb->lock and wait_lock, holding either is sufficient to
+- * observe it.
+- *
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1695,31 +1695,14 @@ void rt_mutex_proxy_unlock(struct rt_mut
+ rt_mutex_set_owner(lock, NULL);
+ }
+
+-/**
+- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+- * @lock: the rt_mutex to take
+- * @waiter: the pre-initialized rt_mutex_waiter
+- * @task: the task to prepare
+- *
+- * Returns:
+- * 0 - task blocked on lock
+- * 1 - acquired the lock for task, caller should wake it up
+- * <0 - error
+- *
+- * Special API call for FUTEX_REQUEUE_PI support.
+- */
+-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+ {
+ int ret;
+
+- raw_spin_lock_irq(&lock->wait_lock);
+-
+- if (try_to_take_rt_mutex(lock, task, NULL)) {
+- raw_spin_unlock_irq(&lock->wait_lock);
++ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+- }
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+@@ -1738,12 +1721,36 @@ int rt_mutex_start_proxy_lock(struct rt_
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+
+- raw_spin_unlock_irq(&lock->wait_lock);
+-
+ debug_rt_mutex_print_deadlock(waiter);
+
+ return ret;
+ }
++
++/**
++ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
++ * @lock: the rt_mutex to take
++ * @waiter: the pre-initialized rt_mutex_waiter
++ * @task: the task to prepare
++ *
++ * Returns:
++ * 0 - task blocked on lock
++ * 1 - acquired the lock for task, caller should wake it up
++ * <0 - error
++ *
++ * Special API call for FUTEX_REQUEUE_PI support.
++ */
++int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter,
++ struct task_struct *task)
++{
++ int ret;
++
++ raw_spin_lock_irq(&lock->wait_lock);
++ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
++ raw_spin_unlock_irq(&lock->wait_lock);
++
++ return ret;
++}
+
+ /**
+ * rt_mutex_next_owner - return the next owner of the lock
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s
+ struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
+ extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
++extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter,
++ struct task_struct *task);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:43:10 +0200
+Subject: futex: Fix incorrect should_fail_futex() handling
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqXlZ4Vb3r3Q8f@decadent.org.uk>
+Content-Disposition: inline
+
+From: Mateusz Nosek <mateusznosek0@gmail.com>
+
+commit 921c7ebd1337d1a46783d7e15a850e12aed2eaa0 upstream.
+
+If should_fail_futex() returns true in wake_futex_pi(), then the 'ret'
+variable is set to -EFAULT and then immediately overwritten. So the failure
+injection is non-functional.
+
+Fix it by actually leaving the function and returning -EFAULT.
+
+The Fixes tag is kinda blurry because the initial commit which introduced
+failure injection was already sloppy, but the below-mentioned commit broke
+it completely.
+
+[ tglx: Massaged changelog ]
+
+Fixes: 6b4f4bc9cb22 ("locking/futex: Allow low-level atomic operations to return -EAGAIN")
+Signed-off-by: Mateusz Nosek <mateusznosek0@gmail.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lore.kernel.org/r/20200927000858.24219-1-mateusznosek0@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1605,8 +1605,10 @@ static int wake_futex_pi(u32 __user *uad
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+- if (unlikely(should_fail_futex(true)))
++ if (unlikely(should_fail_futex(true))) {
+ ret = -EFAULT;
++ goto out_unlock;
++ }
+
+ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ if (!ret && (curval != uval)) {
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:32 +0200
+Subject: futex: Fix (possible) missed wakeup
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqOPquwh/m9sL9@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit b061c38bef43406df8e73c5be06cbfacad5ee6ad upstream.
+
+We must not rely on wake_q_add() to delay the wakeup; in particular
+commit:
+
+ 1d0dcb3ad9d3 ("futex: Implement lockless wakeups")
+
+moved wake_q_add() before smp_store_release(&q->lock_ptr, NULL), which
+could result in futex_wait() waking before observing ->lock_ptr ==
+NULL and going back to sleep again.
+
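+In C11 terms the required ordering is roughly the following (a schematic
+only; the struct and queue_wakeup() callback stand in for the real futex_q
+and wake_q_add()):
+
+  #include <stdatomic.h>
+  #include <stddef.h>
+
+  struct futex_q_sketch {
+          _Atomic(void *) lock_ptr;
+  };
+
+  /* The waiter may run (and free *q) as soon as it sees lock_ptr == NULL,
+   * so the wakeup must not be queued before that store. */
+  static void mark_wake_sketch(struct futex_q_sketch *q,
+                               void (*queue_wakeup)(void))
+  {
+          /* 1. publish "unqueued" with release semantics ... */
+          atomic_store_explicit(&q->lock_ptr, NULL, memory_order_release);
+          /* 2. ... and only then queue the wakeup; the other order lets
+           *    the woken task miss lock_ptr == NULL and sleep again. */
+          queue_wakeup();
+  }
+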
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Fixes: 1d0dcb3ad9d3 ("futex: Implement lockless wakeups")
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1553,11 +1553,7 @@ static void mark_wake_futex(struct wake_
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
+- /*
+- * Queue the task for later wakeup for after we've released
+- * the hb->lock. wake_q_add() grabs reference to p.
+- */
+- wake_q_add(wake_q, p);
++ get_task_struct(p);
+ __unqueue_futex(q);
+ /*
+ * The waiting task can free the futex_q as soon as
+@@ -1566,6 +1562,13 @@ static void mark_wake_futex(struct wake_
+ * store to lock_ptr from getting ahead of the plist_del.
+ */
+ smp_store_release(&q->lock_ptr, NULL);
++
++ /*
++ * Queue the task for later wakeup for after we've released
++ * the hb->lock. wake_q_add() grabs reference to p.
++ */
++ wake_q_add(wake_q, p);
++ put_task_struct(p);
+ }
+
+ /*
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:20 +0200
+Subject: futex: Handle early deadlock return correctly
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqLAxAfaH+BDVy@decadent.org.uk>
+Content-Disposition: inline
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 1a1fb985f2e2b85ec0d3dc2e519ee48389ec2434 upstream.
+
+commit 56222b212e8e ("futex: Drop hb->lock before enqueueing on the
+rtmutex") changed the locking rules in the futex code so that the hash
+bucket lock is no longer held while the waiter is enqueued into the
+rtmutex wait list. This made the lock and unlock paths symmetric, but
+unfortunately the possible early exit from __rt_mutex_proxy_start() due to
+a detected deadlock was not updated accordingly. That allows a concurrent
+unlocker to observe inconsistent state, which triggers the warning in the
+unlock path.
+
+futex_lock_pi() futex_unlock_pi()
+ lock(hb->lock)
+ queue(hb_waiter) lock(hb->lock)
+ lock(rtmutex->wait_lock)
+ unlock(hb->lock)
+ // acquired hb->lock
+ hb_waiter = futex_top_waiter()
+ lock(rtmutex->wait_lock)
+ __rt_mutex_proxy_start()
+ ---> fail
+ remove(rtmutex_waiter);
+ ---> returns -EDEADLOCK
+ unlock(rtmutex->wait_lock)
+ // acquired wait_lock
+ wake_futex_pi()
+ rt_mutex_next_owner()
+ --> returns NULL
+ --> WARN
+
+ lock(hb->lock)
+ unqueue(hb_waiter)
+
+The problem is caused by the remove(rtmutex_waiter) in the failure case of
+__rt_mutex_proxy_start() as this lets the unlocker observe a waiter in the
+hash bucket but no waiter on the rtmutex, i.e. inconsistent state.
+
+The original commit handles this correctly for the other early return cases
+(timeout, signal) by delaying the removal of the rtmutex waiter until the
+returning task reacquired the hash bucket lock.
+
+Treat the failure case of __rt_mutex_proxy_start() in the same way and let
+the existing cleanup code handle the eventual handover of the rtmutex
+gracefully. The regular rt_mutex_proxy_start() gains the rtmutex waiter
+removal for the failure case, so that the other callsites are still
+operating correctly.
+
+Add proper comments to the code so all these details are fully documented.
+
+Thanks to Peter for helping with the analysis and writing the really
+valuable code comments.
+
+Fixes: 56222b212e8e ("futex: Drop hb->lock before enqueueing on the rtmutex")
+Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Co-developed-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Cc: linux-s390@vger.kernel.org
+Cc: Stefan Liebler <stli@linux.ibm.com>
+Cc: Sebastian Sewior <bigeasy@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1901292311410.1950@nanos.tec.linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 28 ++++++++++++++++++----------
+ kernel/locking/rtmutex.c | 37 ++++++++++++++++++++++++++++++++-----
+ 2 files changed, 50 insertions(+), 15 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2958,35 +2958,39 @@ retry_private:
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+- * latter before calling rt_mutex_start_proxy_lock(). This still fully
+- * serializes against futex_unlock_pi() as that does the exact same
+- * lock handoff sequence.
++ * latter before calling __rt_mutex_start_proxy_lock(). This
++ * interleaves with futex_unlock_pi() -- which does a similar lock
++ * handoff -- such that the latter can observe the futex_q::pi_state
++ * before __rt_mutex_start_proxy_lock() is done.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
++ /*
++ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
++ * such that futex_unlock_pi() is guaranteed to observe the waiter when
++ * it sees the futex_q::pi_state.
++ */
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+-
+- spin_lock(q.lock_ptr);
+- goto no_block;
++ goto cleanup;
+ }
+
+-
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
++cleanup:
+ spin_lock(q.lock_ptr);
+ /*
+- * If we failed to acquire the lock (signal/timeout), we must
++ * If we failed to acquire the lock (deadlock/signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+- * wait lists consistent.
++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
++ * lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
+@@ -3093,6 +3097,10 @@ retry:
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
++ *
++ * In particular; this forces __rt_mutex_start_proxy() to
++ * complete such that we're guaranteed to observe the
++ * rt_waiter. Also see the WARN in wake_futex_pi().
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1695,12 +1695,33 @@ void rt_mutex_proxy_unlock(struct rt_mut
+ rt_mutex_set_owner(lock, NULL);
+ }
+
++/**
++ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
++ * @lock: the rt_mutex to take
++ * @waiter: the pre-initialized rt_mutex_waiter
++ * @task: the task to prepare
++ *
++ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
++ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
++ *
++ * NOTE: does _NOT_ remove the @waiter on failure; must either call
++ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
++ *
++ * Returns:
++ * 0 - task blocked on lock
++ * 1 - acquired the lock for task, caller should wake it up
++ * <0 - error
++ *
++ * Special API call for PI-futex support.
++ */
+ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+ {
+ int ret;
+
++ lockdep_assert_held(&lock->wait_lock);
++
+ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+
+@@ -1718,9 +1739,6 @@ int __rt_mutex_start_proxy_lock(struct r
+ ret = 0;
+ }
+
+- if (unlikely(ret))
+- remove_waiter(lock, waiter);
+-
+ debug_rt_mutex_print_deadlock(waiter);
+
+ return ret;
+@@ -1732,12 +1750,18 @@ int __rt_mutex_start_proxy_lock(struct r
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
++ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
++ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
++ *
++ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
++ * on failure.
++ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+- * Special API call for FUTEX_REQUEUE_PI support.
++ * Special API call for PI-futex support.
+ */
+ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+@@ -1747,6 +1771,8 @@ int rt_mutex_start_proxy_lock(struct rt_
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
++ if (unlikely(ret))
++ remove_waiter(lock, waiter);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+@@ -1814,7 +1840,8 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
++ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
++ * rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:43:15 +0200
+Subject: futex: Handle transient "ownerless" rtmutex state correctly
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqY9a/qbJKK5eC@decadent.org.uk>
+Content-Disposition: inline
+
+From: Mike Galbraith <efault@gmx.de>
+
+commit 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 upstream.
+
+Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
+This is one possible chain of events leading to this:
+
+Task Prio Operation
+T1 120 lock(F)
+T2 120 lock(F) -> blocks (top waiter)
+T3 50 (RT) lock(F) -> boosts T1 and blocks (new top waiter)
+XX timeout/ -> wakes T2
+ signal
+T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
+T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter
+ and the lower priority T2 cannot steal the lock.
+ -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
+
+The comment states that this is invalid and rt_mutex_real_owner() must
+return a non-NULL owner when the trylock failed, but in the case of a queued
+and woken-up waiter, rt_mutex_real_owner() == NULL is a valid transient
+state. The higher priority waiter has simply not yet managed to take over
+the rtmutex.
+
+The BUG_ON() is therefore wrong and this is just another retry condition in
+fixup_pi_state_owner().
+
+Drop the locks, so that T3 can make progress, and then try the fixup again.
+
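+Schematically, the fix turns the BUG_ON() into just another retry (the
+helper names below are hypothetical; the real change is in the diff):
+
+  #include <stddef.h>
+
+  struct task;
+
+  static int fixup_owner_sketch(struct task *(*rt_mutex_owner_sketch)(void),
+                                int (*drop_locks_and_retry)(void))
+  {
+          struct task *newowner = rt_mutex_owner_sketch();
+
+          /* Transient "ownerless" state: the boosted waiter has not taken
+           * over the rtmutex yet.  Retry instead of BUG()ing. */
+          if (newowner == NULL)
+                  return drop_locks_and_retry();   /* -EAGAIN path */
+
+          /* ... normal ownership fixup ... */
+          return 0;
+  }
+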
+Gratian provided a great analysis, traces and a reproducer. The analysis is
+to the point, but it confused the hell out of that tglx dude who had to
+page in all the futex horrors again. Condensed version is above.
+
+[ tglx: Wrote comment and changelog ]
+
+Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
+Reported-by: Gratian Crisan <gratian.crisan@ni.com>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com
+Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2497,10 +2497,22 @@ retry:
+ }
+
+ /*
+- * Since we just failed the trylock; there must be an owner.
++ * The trylock just failed, so either there is an owner or
++ * there is a higher priority waiter than this one.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+- BUG_ON(!newowner);
++ /*
++ * If the higher priority waiter has not yet taken over the
++ * rtmutex then newowner is NULL. We can't return here with
++ * that state because it's inconsistent vs. the user space
++ * state. So drop the locks and try again. It's a valid
++ * situation and not any different from the other retry
++ * conditions.
++ */
++ if (unlikely(!newowner)) {
++ err = -EAGAIN;
++ goto handle_err;
++ }
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:58 +0200
+Subject: futex: Prevent robust futex exit race
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqUkDvp7aAhzSQ@decadent.org.uk>
+Content-Disposition: inline
+
+From: Yang Tao <yang.tao172@zte.com.cn>
+
+commit ca16d5bee59807bf04deaab0a8eccecd5061528c upstream.
+
+Robust futexes utilize the robust_list mechanism to allow the kernel to
+release futexes which are held when a task exits. The exit can be voluntary
+or caused by a signal or a fault. This prevents waiters from blocking forever.
+
+The futex operations in user space store a pointer to the futex they are
+either locking or unlocking in the op_pending member of the per task robust
+list.
+
+After a lock operation has succeeded the futex is queued in the robust list
+linked list and the op_pending pointer is cleared.
+
+After an unlock operation has succeeded the futex is removed from the
+robust list linked list and the op_pending pointer is cleared.
+
+The robust list exit code checks for the pending operation and any futex
+which is queued in the linked list. It carefully checks whether the futex
+value is the TID of the exiting task. If so, it sets the OWNER_DIED bit and
+tries to wake up a potential waiter.
+
+This is race free for the lock operation but unlock has two race scenarios
+where waiters might not be woken up. These issues can be observed with
+regular robust pthread mutexes. PI aware pthread mutexes are not affected.
+
+(1) Unlocking task is killed after unlocking the futex value in user space
+ before being able to wake a waiter.
+
+ pthread_mutex_unlock()
+ |
+ V
+ atomic_exchange_rel (&mutex->__data.__lock, 0)
+ <------------------------killed
+ lll_futex_wake () |
+ |
+ |(__lock = 0)
+ |(enter kernel)
+ |
+ V
+ do_exit()
+ exit_mm()
+ mm_release()
+ exit_robust_list()
+ handle_futex_death()
+ |
+ |(__lock = 0)
+ |(uval = 0)
+ |
+ V
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ return 0;
+
+ The sanity check which ensures that the user space futex is owned by
+ the exiting task prevents the wakeup of waiters which in consequence
+ block infinitely.
+
+(2) Waiting task is killed after a wakeup and before it can acquire the
+ futex in user space.
+
+ OWNER WAITER
+ futex_wait()
+ pthread_mutex_unlock() |
+ | |
+ |(__lock = 0) |
+ | |
+ V |
+ futex_wake() ------------> wakeup()
+ |
+ |(return to userspace)
+ |(__lock = 0)
+ |
+ V
+ oldval = mutex->__data.__lock
+ <-----------------killed
+ atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, |
+ id | assume_other_futex_waiters, 0) |
+ |
+ |
+ (enter kernel)|
+ |
+ V
+ do_exit()
+ |
+ |
+ V
+ handle_futex_death()
+ |
+ |(__lock = 0)
+ |(uval = 0)
+ |
+ V
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ return 0;
+
+ The sanity check which ensures that the user space futex is owned
+ by the exiting task prevents the wakeup of waiters, which seems to
+ be correct as the exiting task does not own the futex value, but
+    the consequence is that other waiters won't be woken up and block
+ infinitely.
+
+In both scenarios the following conditions are true:
+
+ - task->robust_list->list_op_pending != NULL
+ - user space futex value == 0
+ - Regular futex (not PI)
+
+If these conditions are met then it is reasonably safe to wake up a
+potential waiter in order to prevent the above problems.
+
+As this might be a false positive it can cause spurious wakeups, but the
+waiter side has to handle other types of unrelated wakeups, e.g. signals
+gracefully anyway. So such a spurious wakeup will not affect the
+correctness of these operations.
+
+This workaround must not touch the user space futex value and cannot set
+the OWNER_DIED bit because the lock value is 0, i.e. uncontended. Setting
+OWNER_DIED in this case would result in inconsistent state and subsequently
+in malfunction of the owner died handling in user space.
+
+The rest of the user space state is still consistent as no other task can
+observe the list_op_pending entry in the exiting tasks robust list.
+
+The eventually woken up waiter will observe the uncontended lock value and
+take it over.
+
+[ tglx: Massaged changelog and comment. Made the return explicit and not
+ depend on the subsequent check and added constants to hand into
+ handle_futex_death() instead of plain numbers. Fixed a few coding
+ style issues. ]
+
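+Condensed into pseudo-C (the TID check and the real wakeup plumbing are
+elided; see handle_futex_death() in the diff below), the added rule is:
+
+  #include <stdbool.h>
+
+  static int handle_death_sketch(unsigned int uval, bool pi, bool pending_op,
+                                 void (*wake_one_waiter)(void))
+  {
+          /* pending unlock/lock on an uncontended, non-PI futex:
+           * wake a potential waiter without touching the futex value. */
+          if (pending_op && !pi && uval == 0) {
+                  wake_one_waiter();      /* possibly spurious, but safe */
+                  return 0;
+          }
+          /* ... regular OWNER_DIED handling based on the TID ... */
+          return 0;
+  }
+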
+Fixes: 0771dfefc9e5 ("[PATCH] lightweight robust futexes: core")
+Signed-off-by: Yang Tao <yang.tao172@zte.com.cn>
+Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/1573010582-35297-1-git-send-email-wang.yi59@zte.com.cn
+Link: https://lkml.kernel.org/r/20191106224555.943191378@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 51 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3526,11 +3526,16 @@ err_unlock:
+ return ret;
+ }
+
++/* Constants for the pending_op argument of handle_futex_death */
++#define HANDLE_DEATH_PENDING true
++#define HANDLE_DEATH_LIST false
++
+ /*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
++static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
++ bool pi, bool pending_op)
+ {
+ u32 uval, uninitialized_var(nval), mval;
+ int err;
+@@ -3543,6 +3548,42 @@ retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
++ /*
++ * Special case for regular (non PI) futexes. The unlock path in
++ * user space has two race scenarios:
++ *
++ * 1. The unlock path releases the user space futex value and
++ * before it can execute the futex() syscall to wake up
++ * waiters it is killed.
++ *
++ * 2. A woken up waiter is killed before it can acquire the
++ * futex in user space.
++ *
++ * In both cases the TID validation below prevents a wakeup of
++ * potential waiters which can cause these waiters to block
++ * forever.
++ *
++ * In both cases the following conditions are met:
++ *
++ * 1) task->robust_list->list_op_pending != NULL
++ * @pending_op == true
++ * 2) User space futex value == 0
++ * 3) Regular futex: @pi == false
++ *
++ * If these conditions are met, it is safe to attempt waking up a
++ * potential waiter without touching the user space futex value and
++ * trying to set the OWNER_DIED bit. The user space futex value is
++ * uncontended and the rest of the user space mutex state is
++ * consistent, so a woken waiter will just take over the
++ * uncontended futex. Setting the OWNER_DIED bit would create
++ * inconsistent state and malfunction of the user space owner died
++ * handling.
++ */
++ if (pending_op && !pi && !uval) {
++ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++ return 0;
++ }
++
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ return 0;
+
+@@ -3662,10 +3703,11 @@ static void exit_robust_list(struct task
+ * A pending lock might already be on the list, so
+ * don't process it twice:
+ */
+- if (entry != pending)
++ if (entry != pending) {
+ if (handle_futex_death((void __user *)entry + futex_offset,
+- curr, pi))
++ curr, pi, HANDLE_DEATH_LIST))
+ return;
++ }
+ if (rc)
+ return;
+ entry = next_entry;
+@@ -3679,9 +3721,10 @@ static void exit_robust_list(struct task
+ cond_resched();
+ }
+
+- if (pending)
++ if (pending) {
+ handle_futex_death((void __user *)pending + futex_offset,
+- curr, pip);
++ curr, pip, HANDLE_DEATH_PENDING);
++ }
+ }
+
+ static void futex_cleanup(struct task_struct *tsk)
+@@ -3964,7 +4007,8 @@ void compat_exit_robust_list(struct task
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+- if (handle_futex_death(uaddr, curr, pi))
++ if (handle_futex_death(uaddr, curr, pi,
++ HANDLE_DEATH_LIST))
+ return;
+ }
+ if (rc)
+@@ -3983,7 +4027,7 @@ void compat_exit_robust_list(struct task
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+- handle_futex_death(uaddr, curr, pip);
++ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+ }
+ }
+
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:41:42 +0200
+Subject: futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqBnDrUInZoDsM@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit cfafcd117da0216520568c195cb2f6cd1980c4bb upstream.
+
+By changing futex_lock_pi() to use rt_mutex_*_proxy_lock() all wait_list
+modifications are done under both hb->lock and wait_lock.
+
+This closes the obvious interleave pattern between futex_lock_pi() and
+futex_unlock_pi(), but not entirely so. See below:
+
+Before:
+
+futex_lock_pi() futex_unlock_pi()
+ unlock hb->lock
+
+ lock hb->lock
+ unlock hb->lock
+
+ lock rt_mutex->wait_lock
+ unlock rt_mutex_wait_lock
+ -EAGAIN
+
+ lock rt_mutex->wait_lock
+ list_add
+ unlock rt_mutex->wait_lock
+
+ schedule()
+
+ lock rt_mutex->wait_lock
+ list_del
+ unlock rt_mutex->wait_lock
+
+ <idem>
+ -EAGAIN
+
+ lock hb->lock
+
+After:
+
+futex_lock_pi() futex_unlock_pi()
+
+ lock hb->lock
+ lock rt_mutex->wait_lock
+ list_add
+ unlock rt_mutex->wait_lock
+ unlock hb->lock
+
+ schedule()
+ lock hb->lock
+ unlock hb->lock
+ lock hb->lock
+ lock rt_mutex->wait_lock
+ list_del
+ unlock rt_mutex->wait_lock
+
+ lock rt_mutex->wait_lock
+ unlock rt_mutex_wait_lock
+ -EAGAIN
+
+ unlock hb->lock
+
+It does however solve the earlier starvation/live-lock scenario which got
+introduced with the -EAGAIN, since unlike the before scenario, where the
+-EAGAIN happens while futex_unlock_pi() doesn't hold any locks, in the
+after scenario it happens while futex_unlock_pi() actually holds a lock,
+and it is then serialized on that lock.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 77 ++++++++++++++++++++++++++++------------
+ kernel/locking/rtmutex.c | 26 +++----------
+ kernel/locking/rtmutex_common.h | 1
+ 3 files changed, 62 insertions(+), 42 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2333,20 +2333,7 @@ queue_unlock(struct futex_hash_bucket *h
+ hb_waiters_dec(hb);
+ }
+
+-/**
+- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+- * @q: The futex_q to enqueue
+- * @hb: The destination hash bucket
+- *
+- * The hb->lock must be held by the caller, and is released here. A call to
+- * queue_me() is typically paired with exactly one call to unqueue_me(). The
+- * exceptions involve the PI related operations, which may use unqueue_me_pi()
+- * or nothing if the unqueue is done as part of the wake process and the unqueue
+- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+- * an example).
+- */
+-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+- __releases(&hb->lock)
++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ {
+ int prio;
+
+@@ -2363,6 +2350,24 @@ static inline void queue_me(struct futex
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
++}
++
++/**
++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
++ * @q: The futex_q to enqueue
++ * @hb: The destination hash bucket
++ *
++ * The hb->lock must be held by the caller, and is released here. A call to
++ * queue_me() is typically paired with exactly one call to unqueue_me(). The
++ * exceptions involve the PI related operations, which may use unqueue_me_pi()
++ * or nothing if the unqueue is done as part of the wake process and the unqueue
++ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
++ * an example).
++ */
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++ __releases(&hb->lock)
++{
++ __queue_me(q, hb);
+ spin_unlock(&hb->lock);
+ }
+
+@@ -2868,6 +2873,7 @@ static int futex_lock_pi(u32 __user *uad
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct task_struct *exiting = NULL;
++ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+@@ -2928,25 +2934,52 @@ retry_private:
+ }
+ }
+
++ WARN_ON(!q.pi_state);
++
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+- queue_me(&q, hb);
++ __queue_me(&q, hb);
+
+- WARN_ON(!q.pi_state);
+- /*
+- * Block on the PI mutex:
+- */
+- if (!trylock) {
+- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+- } else {
++ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
++ goto no_block;
+ }
+
++ /*
++ * We must add ourselves to the rt_mutex waitlist while holding hb->lock
++ * such that the hb and rt_mutex wait lists match.
++ */
++ rt_mutex_init_waiter(&rt_waiter);
++ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ if (ret) {
++ if (ret == 1)
++ ret = 0;
++
++ goto no_block;
++ }
++
++ spin_unlock(q.lock_ptr);
++
++ if (unlikely(to))
++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
++
++ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
++
+ spin_lock(q.lock_ptr);
+ /*
++ * If we failed to acquire the lock (signal/timeout), we must
++ * first acquire the hb->lock before removing the lock from the
++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
++ * wait lists consistent.
++ */
++ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
++ ret = 0;
++
++no_block:
++ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1523,19 +1523,6 @@ int __sched rt_mutex_lock_interruptible(
+ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+ /*
+- * Futex variant with full deadlock detection.
+- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
+- */
+-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+- struct hrtimer_sleeper *timeout)
+-{
+- might_sleep();
+-
+- return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
+- timeout, RT_MUTEX_FULL_CHAINWALK);
+-}
+-
+-/*
+ * Futex variant, must not use fastpath.
+ */
+ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+@@ -1808,12 +1795,6 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+
+- /*
+- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+- * have to fix that up.
+- */
+- fixup_rt_mutex_waiters(lock);
+-
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+@@ -1853,6 +1834,13 @@ bool rt_mutex_cleanup_proxy_lock(struct
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
++
++ /*
++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++ * have to fix that up.
++ */
++ fixup_rt_mutex_waiters(lock);
++
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -112,7 +112,6 @@ extern int rt_mutex_wait_proxy_lock(stru
+ struct rt_mutex_waiter *waiter);
+ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+ extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+ extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
+
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:08 +0200
+Subject: futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqIHbfbs/FszkQ@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 04dc1b2fff4e96cb4142227fbdc63c8871ad4ed9 upstream.
+
+Markus reported that the glibc/nptl/tst-robustpi8 test was failing after
+commit:
+
+ cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+
+The following trace shows the problem:
+
+ ld-linux-x86-64-2161 [019] .... 410.760971: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_LOCK_PI
+ ld-linux-x86-64-2161 [019] ...1 410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0
+ ld-linux-x86-64-2165 [011] .... 410.760978: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_UNLOCK_PI
+ ld-linux-x86-64-2165 [011] d..1 410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0
+ ld-linux-x86-64-2165 [011] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000
+ ld-linux-x86-64-2161 [019] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT
+
+Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161
+which then returns with -ETIMEDOUT. That wrecks the lock state, because now
+the owner isn't aware it acquired the lock and removes the pending robust
+list entry.
+
+If 2161 is killed, the robust list will not clear out this futex and the
+subsequent acquire on this futex will then (correctly) result in -ESRCH
+which is unexpected by glibc, triggers an internal assertion and dies.
+
+Task 2161 Task 2165
+
+rt_mutex_wait_proxy_lock()
+ timeout();
+ /* T2161 is still queued in the waiter list */
+ return -ETIMEDOUT;
+
+ futex_unlock_pi()
+ spin_lock(hb->lock);
+ rtmutex_unlock()
+ remove_rtmutex_waiter(T2161);
+ mark_lock_available();
+ /* Make the next waiter owner of the user space side */
+ futex_uval = 2161;
+ spin_unlock(hb->lock);
+spin_lock(hb->lock);
+rt_mutex_cleanup_proxy_lock()
+ if (rtmutex_owner() != current)
+ ...
+ return FAIL;
+....
+return -ETIMEOUT;
+
+This means that rt_mutex_cleanup_proxy_lock() needs to call
+try_to_take_rt_mutex() so it can take over the rtmutex correctly which was
+assigned by the waker. If the rtmutex is owned by some other task then this
+call is harmless and just confirms that the waiter is not able to acquire
+it.
+
+While there, fix what looks like a merge error which resulted in
+rt_mutex_cleanup_proxy_lock() having two calls to
+fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any.
+Both should have one, since both potentially touch the waiter list.
+
+Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()")
+Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
+Bug-Spotted-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Florian Weimer <fweimer@redhat.com>
+Cc: Darren Hart <dvhart@infradead.org>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
+Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/locking/rtmutex.c | 24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1796,12 +1796,14 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+-
+- set_current_state(TASK_INTERRUPTIBLE);
+-
+ /* sleep on the mutex */
++ set_current_state(TASK_INTERRUPTIBLE);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+-
++ /*
++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++ * have to fix that up.
++ */
++ fixup_rt_mutex_waiters(lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+@@ -1833,15 +1835,25 @@ bool rt_mutex_cleanup_proxy_lock(struct
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
++ * Do an unconditional try-lock, this deals with the lock stealing
++ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
++ * sets a NULL owner.
++ *
++ * We're not interested in the return value, because the subsequent
++ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
++ * we will own the lock and it will have removed the waiter. If we
++ * failed the trylock, we're still not owner and we need to remove
++ * ourselves.
++ */
++ try_to_take_rt_mutex(lock, current, waiter);
++ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+- fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
+-
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:41:33 +0200
+Subject: futex,rt_mutex: Introduce rt_mutex_init_waiter()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDp/WL8DllYuErc@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 50809358dd7199aa7ce232f6877dd09ec30ef374 upstream.
+
+Since there are already two copies of this code, introduce a helper now
+before adding a third one.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 5 +----
+ kernel/locking/rtmutex.c | 12 +++++++++---
+ kernel/locking/rtmutex_common.h | 1 +
+ 3 files changed, 11 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3234,10 +3234,7 @@ static int futex_wait_requeue_pi(u32 __u
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+- debug_rt_mutex_init_waiter(&rt_waiter);
+- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+- RB_CLEAR_NODE(&rt_waiter.tree_entry);
+- rt_waiter.task = NULL;
++ rt_mutex_init_waiter(&rt_waiter);
+
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1176,6 +1176,14 @@ void rt_mutex_adjust_pi(struct task_stru
+ next_lock, NULL, task);
+ }
+
++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
++{
++ debug_rt_mutex_init_waiter(waiter);
++ RB_CLEAR_NODE(&waiter->pi_tree_entry);
++ RB_CLEAR_NODE(&waiter->tree_entry);
++ waiter->task = NULL;
++}
++
+ /**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+@@ -1258,9 +1266,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+ unsigned long flags;
+ int ret = 0;
+
+- debug_rt_mutex_init_waiter(&waiter);
+- RB_CLEAR_NODE(&waiter.pi_tree_entry);
+- RB_CLEAR_NODE(&waiter.tree_entry);
++ rt_mutex_init_waiter(&waiter);
+
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -103,6 +103,7 @@ extern struct task_struct *rt_mutex_next
+ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:40:54 +0200
+Subject: futex: Use smp_store_release() in mark_wake_futex()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDp1qJOCUJmE1Ty@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 1b367ece0d7e696cab1c8501bab282cc6a538b3f upstream.
+
+Since the futex_q can disappear the instruction after assigning NULL,
+this really should be a RELEASE barrier. That stops loads from hitting
+dead memory too.
+
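+In C11 terms (a sketch, with the atomic pointer standing in for
+q->lock_ptr): a release store orders the earlier loads and stores of the
+futex_q before the NULL becomes visible, which smp_wmb() alone does not do
+for the loads:
+
+  #include <stdatomic.h>
+  #include <stddef.h>
+
+  static void publish_unqueued(_Atomic(void *) *lock_ptr)
+  {
+          /* roughly smp_store_release(&q->lock_ptr, NULL) */
+          atomic_store_explicit(lock_ptr, NULL, memory_order_release);
+  }
+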
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.604296452@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1565,8 +1565,7 @@ static void mark_wake_futex(struct wake_
+ * memory barrier is required here to prevent the following
+ * store to lock_ptr from getting ahead of the plist_del.
+ */
+- smp_wmb();
+- q->lock_ptr = NULL;
++ smp_store_release(&q->lock_ptr, NULL);
+ }
+
+ /*
--- /dev/null
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:44 +0200
+Subject: locking/futex: Allow low-level atomic operations to return -EAGAIN
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqRGR3K1fUDkoR@decadent.org.uk>
+Content-Disposition: inline
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 6b4f4bc9cb22875f97023984a625386f0c7cc1c0 upstream.
+
+Some futex() operations, including FUTEX_WAKE_OP, require the kernel to
+perform an atomic read-modify-write of the futex word via the userspace
+mapping. These operations are implemented by each architecture in
+arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
+are called in atomic context with the relevant hash bucket locks held.
+
+Although these routines may return -EFAULT in response to a page fault
+generated when accessing userspace, they are expected to succeed (i.e.
+return 0) in all other cases. This poses a problem for architectures
+that do not provide bounded forward progress guarantees or fairness of
+contended atomic operations and can lead to starvation in some cases.
+
+In these problematic scenarios, we must return back to the core futex
+code so that we can drop the hash bucket locks and reschedule if
+necessary, much like we do in the case of a page fault.
+
+Allow architectures to return -EAGAIN from their implementations of
+arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
+will cause the core futex code to reschedule if necessary and return
+back to the architecture code later on.
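+
+As a hedged, condensed sketch of the resulting caller-side pattern (based
+on the futex_wake_op() hunk below; the real code also special-cases
+!CONFIG_MMU, shared futexes, and bails out for errors other than
+-EFAULT/-EAGAIN):
+
+        op_ret = futex_atomic_op_inuser(op, uaddr2);
+        if (unlikely(op_ret < 0)) {
+                double_unlock_hb(hb1, hb2);     /* drop the hash bucket locks */
+
+                if (op_ret == -EFAULT) {
+                        ret = fault_in_user_writeable(uaddr2);
+                        if (ret)
+                                goto out_put_keys;      /* unrecoverable fault */
+                }
+
+                cond_resched();         /* for -EAGAIN, let the contended side run */
+                goto retry;             /* reacquire locks, redo the atomic op */
+        }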
+
+Cc: <stable@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c | 185 +++++++++++++++++++++++++++++++++++----------------------
+ 1 file changed, 115 insertions(+), 70 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1407,13 +1407,15 @@ static int lookup_pi_state(u32 __user *u
+
+ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+ {
++ int err;
+ u32 uninitialized_var(curval);
+
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
+- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+- return -EFAULT;
++ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++ if (unlikely(err))
++ return err;
+
+ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+@@ -1606,10 +1608,8 @@ static int wake_futex_pi(u32 __user *uad
+ if (unlikely(should_fail_futex(true)))
+ ret = -EFAULT;
+
+- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
+- ret = -EFAULT;
+-
+- } else if (curval != uval) {
++ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++ if (!ret && (curval != uval)) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+@@ -1795,32 +1795,32 @@ retry_private:
+ double_lock_hb(hb1, hb2);
+ op_ret = futex_atomic_op_inuser(op, uaddr2);
+ if (unlikely(op_ret < 0)) {
+-
+ double_unlock_hb(hb1, hb2);
+
+-#ifndef CONFIG_MMU
+- /*
+- * we don't get EFAULT from MMU faults if we don't have an MMU,
+- * but we might get them from range checking
+- */
+- ret = op_ret;
+- goto out_put_keys;
+-#endif
+-
+- if (unlikely(op_ret != -EFAULT)) {
++ if (!IS_ENABLED(CONFIG_MMU) ||
++ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
++ /*
++ * we don't get EFAULT from MMU faults if we don't have
++ * an MMU, but we might get them from range checking
++ */
+ ret = op_ret;
+ goto out_put_keys;
+ }
+
+- ret = fault_in_user_writeable(uaddr2);
+- if (ret)
+- goto out_put_keys;
++ if (op_ret == -EFAULT) {
++ ret = fault_in_user_writeable(uaddr2);
++ if (ret)
++ goto out_put_keys;
++ }
+
+- if (!(flags & FLAGS_SHARED))
++ if (!(flags & FLAGS_SHARED)) {
++ cond_resched();
+ goto retry_private;
++ }
+
+ put_futex_key(&key2);
+ put_futex_key(&key1);
++ cond_resched();
+ goto retry;
+ }
+
+@@ -2516,14 +2516,17 @@ retry:
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+- if (get_futex_value_locked(&uval, uaddr))
+- goto handle_fault;
++ err = get_futex_value_locked(&uval, uaddr);
++ if (err)
++ goto handle_err;
+
+ for (;;) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+- goto handle_fault;
++ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++ if (err)
++ goto handle_err;
++
+ if (curval == uval)
+ break;
+ uval = curval;
+@@ -2538,23 +2541,36 @@ retry:
+ return argowner == current;
+
+ /*
+- * To handle the page fault we need to drop the locks here. That gives
+- * the other task (either the highest priority waiter itself or the
+- * task which stole the rtmutex) the chance to try the fixup of the
+- * pi_state. So once we are back from handling the fault we need to
+- * check the pi_state after reacquiring the locks and before trying to
+- * do another fixup. When the fixup has been done already we simply
+- * return.
++ * In order to reschedule or handle a page fault, we need to drop the
++ * locks here. In the case of a fault, this gives the other task
++ * (either the highest priority waiter itself or the task which stole
++ * the rtmutex) the chance to try the fixup of the pi_state. So once we
++ * are back from handling the fault we need to check the pi_state after
++ * reacquiring the locks and before trying to do another fixup. When
++ * the fixup has been done already we simply return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ */
+-handle_fault:
++handle_err:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(q->lock_ptr);
+
+- err = fault_in_user_writeable(uaddr);
++ switch (err) {
++ case -EFAULT:
++ err = fault_in_user_writeable(uaddr);
++ break;
++
++ case -EAGAIN:
++ cond_resched();
++ err = 0;
++ break;
++
++ default:
++ WARN_ON_ONCE(1);
++ break;
++ }
+
+ spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+@@ -3128,10 +3144,8 @@ retry:
+ * A unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+- if (ret == -EAGAIN) {
+- put_futex_key(&key);
+- goto retry;
+- }
++ if (ret == -EAGAIN)
++ goto pi_retry;
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+@@ -3146,9 +3160,19 @@ retry:
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
++ if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
+ spin_unlock(&hb->lock);
+- goto pi_faulted;
++ switch (ret) {
++ case -EFAULT:
++ goto pi_faulted;
++
++ case -EAGAIN:
++ goto pi_retry;
++
++ default:
++ WARN_ON_ONCE(1);
++ goto out_putkey;
++ }
+ }
+
+ /*
+@@ -3162,6 +3186,11 @@ out_putkey:
+ put_futex_key(&key);
+ return ret;
+
++pi_retry:
++ put_futex_key(&key);
++ cond_resched();
++ goto retry;
++
+ pi_faulted:
+ put_futex_key(&key);
+
+@@ -3504,6 +3533,7 @@ err_unlock:
+ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+ {
+ u32 uval, uninitialized_var(nval), mval;
++ int err;
+
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+@@ -3513,42 +3543,57 @@ retry:
+ if (get_user(uval, uaddr))
+ return -1;
+
+- if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
+- /*
+- * Ok, this dying thread is truly holding a futex
+- * of interest. Set the OWNER_DIED bit atomically
+- * via cmpxchg, and if the value had FUTEX_WAITERS
+- * set, wake up a waiter (if any). (We have to do a
+- * futex_wake() even if OWNER_DIED is already set -
+- * to handle the rare but possible case of recursive
+- * thread-death.) The rest of the cleanup is done in
+- * userspace.
+- */
+- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+- /*
+- * We are not holding a lock here, but we want to have
+- * the pagefault_disable/enable() protection because
+- * we want to handle the fault gracefully. If the
+- * access fails we try to fault in the futex with R/W
+- * verification via get_user_pages. get_user() above
+- * does not guarantee R/W access. If that fails we
+- * give up and leave the futex locked.
+- */
+- if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
++ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
++ return 0;
++
++ /*
++ * Ok, this dying thread is truly holding a futex
++ * of interest. Set the OWNER_DIED bit atomically
++ * via cmpxchg, and if the value had FUTEX_WAITERS
++ * set, wake up a waiter (if any). (We have to do a
++ * futex_wake() even if OWNER_DIED is already set -
++ * to handle the rare but possible case of recursive
++ * thread-death.) The rest of the cleanup is done in
++ * userspace.
++ */
++ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
++
++ /*
++ * We are not holding a lock here, but we want to have
++ * the pagefault_disable/enable() protection because
++ * we want to handle the fault gracefully. If the
++ * access fails we try to fault in the futex with R/W
++ * verification via get_user_pages. get_user() above
++ * does not guarantee R/W access. If that fails we
++ * give up and leave the futex locked.
++ */
++ if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
++ switch (err) {
++ case -EFAULT:
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+- }
+- if (nval != uval)
++
++ case -EAGAIN:
++ cond_resched();
+ goto retry;
+
+- /*
+- * Wake robust non-PI futexes here. The wakeup of
+- * PI futexes happens in exit_pi_state():
+- */
+- if (!pi && (uval & FUTEX_WAITERS))
+- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++ default:
++ WARN_ON_ONCE(1);
++ return err;
++ }
+ }
++
++ if (nval != uval)
++ goto retry;
++
++ /*
++ * Wake robust non-PI futexes here. The wakeup of
++ * PI futexes happens in exit_pi_state():
++ */
++ if (!pi && (uval & FUTEX_WAITERS))
++ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++
+ return 0;
+ }
+
acpi-scan-use-unique-number-for-instance_no.patch
perf-auxtrace-fix-auxtrace-queue-conflict.patch
idr-add-ida_is_empty.patch
+futex-use-smp_store_release-in-mark_wake_futex.patch
+futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
+futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
+futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
+futex-avoid-freeing-an-active-timer.patch
+futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
+futex-handle-early-deadlock-return-correctly.patch
+futex-fix-possible-missed-wakeup.patch
+locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch
+arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch
+futex-prevent-robust-futex-exit-race.patch
+futex-fix-incorrect-should_fail_futex-handling.patch
+futex-handle-transient-ownerless-rtmutex-state-correctly.patch