git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.9-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Mar 2021 05:49:51 +0000 (07:49 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 29 Mar 2021 05:49:51 +0000 (07:49 +0200)
added patches:
arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch
futex-avoid-freeing-an-active-timer.patch
futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
futex-fix-incorrect-should_fail_futex-handling.patch
futex-fix-possible-missed-wakeup.patch
futex-handle-early-deadlock-return-correctly.patch
futex-handle-transient-ownerless-rtmutex-state-correctly.patch
futex-prevent-robust-futex-exit-race.patch
futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
futex-use-smp_store_release-in-mark_wake_futex.patch
locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch

14 files changed:
queue-4.9/arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch [new file with mode: 0644]
queue-4.9/futex-avoid-freeing-an-active-timer.patch [new file with mode: 0644]
queue-4.9/futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch [new file with mode: 0644]
queue-4.9/futex-fix-incorrect-should_fail_futex-handling.patch [new file with mode: 0644]
queue-4.9/futex-fix-possible-missed-wakeup.patch [new file with mode: 0644]
queue-4.9/futex-handle-early-deadlock-return-correctly.patch [new file with mode: 0644]
queue-4.9/futex-handle-transient-ownerless-rtmutex-state-correctly.patch [new file with mode: 0644]
queue-4.9/futex-prevent-robust-futex-exit-race.patch [new file with mode: 0644]
queue-4.9/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch [new file with mode: 0644]
queue-4.9/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch [new file with mode: 0644]
queue-4.9/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch [new file with mode: 0644]
queue-4.9/futex-use-smp_store_release-in-mark_wake_futex.patch [new file with mode: 0644]
queue-4.9/locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch [new file with mode: 0644]
queue-4.9/series

diff --git a/queue-4.9/arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch b/queue-4.9/arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch
new file mode 100644 (file)
index 0000000..7fc9294
--- /dev/null
@@ -0,0 +1,153 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:51 +0200
+Subject: arm64: futex: Bound number of LDXR/STXR loops in FUTEX_WAKE_OP
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqS7bKz9n1Jto3@decadent.org.uk>
+Content-Disposition: inline
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 03110a5cb2161690ae5ac04994d47ed0cd6cef75 upstream.
+
+Our futex implementation makes use of LDXR/STXR loops to perform atomic
+updates to user memory from atomic context. This can lead to latency
+problems if we end up spinning around the LL/SC sequence at the expense
+of doing something useful.
+
+Rework our futex atomic operations so that we return -EAGAIN if we fail
+to update the futex word after 128 attempts. The core futex code will
+reschedule if necessary and we'll try again later.
+
+Fixes: 6170a97460db ("arm64: Atomic operations")
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/futex.h |   59 +++++++++++++++++++++++++----------------
+ 1 file changed, 37 insertions(+), 22 deletions(-)
+
+--- a/arch/arm64/include/asm/futex.h
++++ b/arch/arm64/include/asm/futex.h
+@@ -26,7 +26,12 @@
+ #include <asm/errno.h>
+ #include <asm/sysreg.h>
++#define FUTEX_MAX_LOOPS       128 /* What's the largest number you can think of? */
++
+ #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg)               \
++do {                                                                  \
++      unsigned int loops = FUTEX_MAX_LOOPS;                           \
++                                                                      \
+       asm volatile(                                                   \
+       ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,            \
+                   CONFIG_ARM64_PAN)                                   \
+@@ -34,21 +39,26 @@
+ "1:   ldxr    %w1, %2\n"                                              \
+       insn "\n"                                                       \
+ "2:   stlxr   %w0, %w3, %2\n"                                         \
+-"     cbnz    %w0, 1b\n"                                              \
+-"     dmb     ish\n"                                                  \
++"     cbz     %w0, 3f\n"                                              \
++"     sub     %w4, %w4, %w0\n"                                        \
++"     cbnz    %w4, 1b\n"                                              \
++"     mov     %w0, %w7\n"                                             \
+ "3:\n"                                                                        \
++"     dmb     ish\n"                                                  \
+ "     .pushsection .fixup,\"ax\"\n"                                   \
+ "     .align  2\n"                                                    \
+-"4:   mov     %w0, %w5\n"                                             \
++"4:   mov     %w0, %w6\n"                                             \
+ "     b       3b\n"                                                   \
+ "     .popsection\n"                                                  \
+       _ASM_EXTABLE(1b, 4b)                                            \
+       _ASM_EXTABLE(2b, 4b)                                            \
+       ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,            \
+                   CONFIG_ARM64_PAN)                                   \
+-      : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp)       \
+-      : "r" (oparg), "Ir" (-EFAULT)                                   \
+-      : "memory")
++      : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp),      \
++        "+r" (loops)                                                  \
++      : "r" (oparg), "Ir" (-EFAULT), "Ir" (-EAGAIN)                   \
++      : "memory");                                                    \
++} while (0)
+ static inline int
+ arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr)
+@@ -59,23 +69,23 @@ arch_futex_atomic_op_inuser(int op, int
+       switch (op) {
+       case FUTEX_OP_SET:
+-              __futex_atomic_op("mov  %w3, %w4",
++              __futex_atomic_op("mov  %w3, %w5",
+                                 ret, oldval, uaddr, tmp, oparg);
+               break;
+       case FUTEX_OP_ADD:
+-              __futex_atomic_op("add  %w3, %w1, %w4",
++              __futex_atomic_op("add  %w3, %w1, %w5",
+                                 ret, oldval, uaddr, tmp, oparg);
+               break;
+       case FUTEX_OP_OR:
+-              __futex_atomic_op("orr  %w3, %w1, %w4",
++              __futex_atomic_op("orr  %w3, %w1, %w5",
+                                 ret, oldval, uaddr, tmp, oparg);
+               break;
+       case FUTEX_OP_ANDN:
+-              __futex_atomic_op("and  %w3, %w1, %w4",
++              __futex_atomic_op("and  %w3, %w1, %w5",
+                                 ret, oldval, uaddr, tmp, ~oparg);
+               break;
+       case FUTEX_OP_XOR:
+-              __futex_atomic_op("eor  %w3, %w1, %w4",
++              __futex_atomic_op("eor  %w3, %w1, %w5",
+                                 ret, oldval, uaddr, tmp, oparg);
+               break;
+       default:
+@@ -95,6 +105,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+                             u32 oldval, u32 newval)
+ {
+       int ret = 0;
++      unsigned int loops = FUTEX_MAX_LOOPS;
+       u32 val, tmp;
+       u32 __user *uaddr;
+@@ -106,21 +117,25 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+ "     prfm    pstl1strm, %2\n"
+ "1:   ldxr    %w1, %2\n"
+-"     sub     %w3, %w1, %w4\n"
+-"     cbnz    %w3, 3f\n"
+-"2:   stlxr   %w3, %w5, %2\n"
+-"     cbnz    %w3, 1b\n"
+-"     dmb     ish\n"
++"     sub     %w3, %w1, %w5\n"
++"     cbnz    %w3, 4f\n"
++"2:   stlxr   %w3, %w6, %2\n"
++"     cbz     %w3, 3f\n"
++"     sub     %w4, %w4, %w3\n"
++"     cbnz    %w4, 1b\n"
++"     mov     %w0, %w8\n"
+ "3:\n"
++"     dmb     ish\n"
++"4:\n"
+ "     .pushsection .fixup,\"ax\"\n"
+-"4:   mov     %w0, %w6\n"
+-"     b       3b\n"
++"5:   mov     %w0, %w7\n"
++"     b       4b\n"
+ "     .popsection\n"
+-      _ASM_EXTABLE(1b, 4b)
+-      _ASM_EXTABLE(2b, 4b)
++      _ASM_EXTABLE(1b, 5b)
++      _ASM_EXTABLE(2b, 5b)
+ ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+-      : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp)
+-      : "r" (oldval), "r" (newval), "Ir" (-EFAULT)
++      : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
++      : "r" (oldval), "r" (newval), "Ir" (-EFAULT), "Ir" (-EAGAIN)
+       : "memory");
+       *uval = val;
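For context, a minimal userspace sketch of how FUTEX_WAKE_OP is driven; the
atomic update of the second futex word is exactly the user-memory LDXR/STXR
loop this patch bounds. This is illustrative only and not part of the patch;
the two futex words and the chosen op/cmp encoding are arbitrary examples.

#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t futex_a, futex_b;       /* two example futex words */

int main(void)
{
        /*
         * Wake up to one waiter on futex_a, atomically add 1 to futex_b and,
         * if its old value was > 0, also wake up to one waiter on futex_b.
         * The "add 1" is the user-memory atomic update done with LDXR/STXR
         * on arm64, which with this patch may give up after FUTEX_MAX_LOOPS
         * attempts and be retried from the core futex code.
         */
        long woken = syscall(SYS_futex, &futex_a, FUTEX_WAKE_OP, 1, 1, &futex_b,
                             FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0));
        if (woken < 0)
                perror("futex(FUTEX_WAKE_OP)");
        else
                printf("woke %ld waiter(s)\n", woken);
        return 0;
}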
diff --git a/queue-4.9/futex-avoid-freeing-an-active-timer.patch b/queue-4.9/futex-avoid-freeing-an-active-timer.patch
new file mode 100644 (file)
index 0000000..c87b966
--- /dev/null
@@ -0,0 +1,61 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:00 +0200
+Subject: futex: Avoid freeing an active timer
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqGB/tmAa2cjOz@decadent.org.uk>
+Content-Disposition: inline
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 97181f9bd57405b879403763284537e27d46963d upstream.
+
+Alexander reported a hrtimer debug_object splat:
+
+  ODEBUG: free active (active state 0) object type: hrtimer hint: hrtimer_wakeup (kernel/time/hrtimer.c:1423)
+
+  debug_object_free (lib/debugobjects.c:603)
+  destroy_hrtimer_on_stack (kernel/time/hrtimer.c:427)
+  futex_lock_pi (kernel/futex.c:2740)
+  do_futex (kernel/futex.c:3399)
+  SyS_futex (kernel/futex.c:3447 kernel/futex.c:3415)
+  do_syscall_64 (arch/x86/entry/common.c:284)
+  entry_SYSCALL64_slow_path (arch/x86/entry/entry_64.S:249)
+
+Which was caused by commit:
+
+  cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+
+... losing the hrtimer_cancel() in the shuffle. Where previously the
+hrtimer_cancel() was done by rt_mutex_slowlock() we now need to do it
+manually.
+
+Reported-by: Alexander Levin <alexander.levin@verizon.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Fixes: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704101802370.2906@nanos
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3018,8 +3018,10 @@ out_unlock_put_key:
+ out_put_key:
+       put_futex_key(&q.key);
+ out:
+-      if (to)
++      if (to) {
++              hrtimer_cancel(&to->timer);
+               destroy_hrtimer_on_stack(&to->timer);
++      }
+       return ret != -EINTR ? ret : -ERESTARTNOINTR;
+ uaddr_faulted:
diff --git a/queue-4.9/futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch b/queue-4.9/futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
new file mode 100644 (file)
index 0000000..a3d59d6
--- /dev/null
@@ -0,0 +1,211 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:41:51 +0200
+Subject: futex: Drop hb->lock before enqueueing on the rtmutex
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqD/h/x3P8HwyB@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 56222b212e8edb1cf51f5dd73ff645809b082b40 upstream.
+
+When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution the PI
+chain code will (falsely) report a deadlock and BUG.
+
+The problem is that it holds hb->lock (now an rt_mutex) while doing
+task_blocks_on_rt_mutex on the futex's pi_state::rtmutex. This, when
+interleaved just right with futex_unlock_pi(), leads it to believe it sees an
+AB-BA deadlock.
+
+  Task1 (holds rt_mutex,       Task2 (does FUTEX_LOCK_PI)
+         does FUTEX_UNLOCK_PI)
+
+                               lock hb->lock
+                               lock rt_mutex (as per start_proxy)
+  lock hb->lock
+
+Which is a trivial AB-BA.
+
+It is not an actual deadlock, because it won't be holding hb->lock by the
+time it actually blocks on the rt_mutex, but the chainwalk code doesn't
+know that and it would be a nightmare to handle this gracefully.
+
+To avoid this problem, do the same as in futex_unlock_pi() and drop
+hb->lock after acquiring wait_lock. This still fully serializes against
+futex_unlock_pi(), since adding to the wait_list does the very same lock
+dance, and removing it holds both locks.
+
+Aside from solving the RT problem this makes the lock and unlock mechanism
+symmetric and reduces the hb->lock held time.
+
+Reported-and-tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Suggested-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.161341537@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c                  |   30 +++++++++++++++++-------
+ kernel/locking/rtmutex.c        |   49 ++++++++++++++++++++++------------------
+ kernel/locking/rtmutex_common.h |    3 ++
+ 3 files changed, 52 insertions(+), 30 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2948,20 +2948,33 @@ retry_private:
+               goto no_block;
+       }
++      rt_mutex_init_waiter(&rt_waiter);
++
+       /*
+-       * We must add ourselves to the rt_mutex waitlist while holding hb->lock
+-       * such that the hb and rt_mutex wait lists match.
++       * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
++       * hold it while doing rt_mutex_start_proxy(), because then it will
++       * include hb->lock in the blocking chain, even through we'll not in
++       * fact hold it while blocking. This will lead it to report -EDEADLK
++       * and BUG when futex_unlock_pi() interleaves with this.
++       *
++       * Therefore acquire wait_lock while holding hb->lock, but drop the
++       * latter before calling rt_mutex_start_proxy_lock(). This still fully
++       * serializes against futex_unlock_pi() as that does the exact same
++       * lock handoff sequence.
+        */
+-      rt_mutex_init_waiter(&rt_waiter);
+-      ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++      raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
++      spin_unlock(q.lock_ptr);
++      ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++      raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
++
+       if (ret) {
+               if (ret == 1)
+                       ret = 0;
++              spin_lock(q.lock_ptr);
+               goto no_block;
+       }
+-      spin_unlock(q.lock_ptr);
+       if (unlikely(to))
+               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+@@ -2974,6 +2987,9 @@ retry_private:
+        * first acquire the hb->lock before removing the lock from the
+        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+        * wait lists consistent.
++       *
++       * In particular; it is important that futex_unlock_pi() can not
++       * observe this inconsistency.
+        */
+       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+               ret = 0;
+@@ -3071,10 +3087,6 @@ retry:
+               get_pi_state(pi_state);
+               /*
+-               * Since modifying the wait_list is done while holding both
+-               * hb->lock and wait_lock, holding either is sufficient to
+-               * observe it.
+-               *
+                * By taking wait_lock while still holding hb->lock, we ensure
+                * there is no point where we hold neither; and therefore
+                * wake_futex_pi() must observe a state consistent with what we
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1695,31 +1695,14 @@ void rt_mutex_proxy_unlock(struct rt_mut
+       rt_mutex_set_owner(lock, NULL);
+ }
+-/**
+- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+- * @lock:             the rt_mutex to take
+- * @waiter:           the pre-initialized rt_mutex_waiter
+- * @task:             the task to prepare
+- *
+- * Returns:
+- *  0 - task blocked on lock
+- *  1 - acquired the lock for task, caller should wake it up
+- * <0 - error
+- *
+- * Special API call for FUTEX_REQUEUE_PI support.
+- */
+-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                             struct rt_mutex_waiter *waiter,
+                             struct task_struct *task)
+ {
+       int ret;
+-      raw_spin_lock_irq(&lock->wait_lock);
+-
+-      if (try_to_take_rt_mutex(lock, task, NULL)) {
+-              raw_spin_unlock_irq(&lock->wait_lock);
++      if (try_to_take_rt_mutex(lock, task, NULL))
+               return 1;
+-      }
+       /* We enforce deadlock detection for futexes */
+       ret = task_blocks_on_rt_mutex(lock, waiter, task,
+@@ -1738,12 +1721,36 @@ int rt_mutex_start_proxy_lock(struct rt_
+       if (unlikely(ret))
+               remove_waiter(lock, waiter);
+-      raw_spin_unlock_irq(&lock->wait_lock);
+-
+       debug_rt_mutex_print_deadlock(waiter);
+       return ret;
+ }
++
++/**
++ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
++ * @lock:             the rt_mutex to take
++ * @waiter:           the pre-initialized rt_mutex_waiter
++ * @task:             the task to prepare
++ *
++ * Returns:
++ *  0 - task blocked on lock
++ *  1 - acquired the lock for task, caller should wake it up
++ * <0 - error
++ *
++ * Special API call for FUTEX_REQUEUE_PI support.
++ */
++int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++                            struct rt_mutex_waiter *waiter,
++                            struct task_struct *task)
++{
++      int ret;
++
++      raw_spin_lock_irq(&lock->wait_lock);
++      ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
++      raw_spin_unlock_irq(&lock->wait_lock);
++
++      return ret;
++}
+ /**
+  * rt_mutex_next_owner - return the next owner of the lock
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s
+                                      struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
+ extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
++extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++                                   struct rt_mutex_waiter *waiter,
++                                   struct task_struct *task);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                    struct rt_mutex_waiter *waiter,
+                                    struct task_struct *task);
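The hand-off described in the comment added above can be summarized with a
small generic userspace sketch; the names and the pthread locks are
hypothetical stand-ins for hb->lock and pi_mutex.wait_lock, not the kernel API.

#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;  /* stands in for hb->lock */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;  /* stands in for pi_mutex.wait_lock */

static void enqueue_waiter(void) { }    /* __rt_mutex_start_proxy_lock() analogue */
static void block_on_lock(void)  { }    /* rt_mutex_wait_proxy_lock() analogue */

static void lock_handoff(void)
{
        pthread_mutex_lock(&outer);
        /* ... set up state protected by the outer lock ... */

        /* Take the inner lock while still holding the outer one ... */
        pthread_mutex_lock(&inner);
        /* ... then drop the outer lock so it never sits in the blocking chain. */
        pthread_mutex_unlock(&outer);

        enqueue_waiter();               /* done under the inner lock only */
        pthread_mutex_unlock(&inner);

        block_on_lock();                /* blocking happens with neither lock held */
}

int main(void)
{
        lock_handoff();
        return 0;
}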
diff --git a/queue-4.9/futex-fix-incorrect-should_fail_futex-handling.patch b/queue-4.9/futex-fix-incorrect-should_fail_futex-handling.patch
new file mode 100644 (file)
index 0000000..0fc668e
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:43:10 +0200
+Subject: futex: Fix incorrect should_fail_futex() handling
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqXlZ4Vb3r3Q8f@decadent.org.uk>
+Content-Disposition: inline
+
+From: Mateusz Nosek <mateusznosek0@gmail.com>
+
+commit 921c7ebd1337d1a46783d7e15a850e12aed2eaa0 upstream.
+
+If should_fail_futex() returns true in futex_wake_pi(), then the 'ret'
+variable is set to -EFAULT and then immediately overwritten. So the failure
+injection is non-functional.
+
+Fix it by actually leaving the function and returning -EFAULT.
+
+The Fixes tag is kinda blurry because the initial commit which introduced
+failure injection was already sloppy, but the below mentioned commit broke
+it completely.
+
+[ tglx: Massaged changelog ]
+
+Fixes: 6b4f4bc9cb22 ("locking/futex: Allow low-level atomic operations to return -EAGAIN")
+Signed-off-by: Mateusz Nosek <mateusznosek0@gmail.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lore.kernel.org/r/20200927000858.24219-1-mateusznosek0@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1605,8 +1605,10 @@ static int wake_futex_pi(u32 __user *uad
+        */
+       newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+-      if (unlikely(should_fail_futex(true)))
++      if (unlikely(should_fail_futex(true))) {
+               ret = -EFAULT;
++              goto out_unlock;
++      }
+       ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+       if (!ret && (curval != uval)) {
diff --git a/queue-4.9/futex-fix-possible-missed-wakeup.patch b/queue-4.9/futex-fix-possible-missed-wakeup.patch
new file mode 100644 (file)
index 0000000..9c8e214
--- /dev/null
@@ -0,0 +1,64 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:32 +0200
+Subject: futex: Fix (possible) missed wakeup
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqOPquwh/m9sL9@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit b061c38bef43406df8e73c5be06cbfacad5ee6ad upstream.
+
+We must not rely on wake_q_add() to delay the wakeup; in particular
+commit:
+
+  1d0dcb3ad9d3 ("futex: Implement lockless wakeups")
+
+moved wake_q_add() before smp_store_release(&q->lock_ptr, NULL), which
+could result in futex_wait() waking before observing ->lock_ptr ==
+NULL and going back to sleep again.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Fixes: 1d0dcb3ad9d3 ("futex: Implement lockless wakeups")
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1553,11 +1553,7 @@ static void mark_wake_futex(struct wake_
+       if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+               return;
+-      /*
+-       * Queue the task for later wakeup for after we've released
+-       * the hb->lock. wake_q_add() grabs reference to p.
+-       */
+-      wake_q_add(wake_q, p);
++      get_task_struct(p);
+       __unqueue_futex(q);
+       /*
+        * The waiting task can free the futex_q as soon as
+@@ -1566,6 +1562,13 @@ static void mark_wake_futex(struct wake_
+        * store to lock_ptr from getting ahead of the plist_del.
+        */
+       smp_store_release(&q->lock_ptr, NULL);
++
++      /*
++       * Queue the task for later wakeup for after we've released
++       * the hb->lock. wake_q_add() grabs reference to p.
++       */
++      wake_q_add(wake_q, p);
++      put_task_struct(p);
+ }
+ /*
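A userspace analogue of the ordering this patch restores, sketched below: the
waker must publish its state change (a release store to the futex word) before
issuing the wakeup, otherwise the woken task can recheck the condition, still
see the old value and go back to sleep. The variable names and the one-word
protocol are illustrative assumptions, not glibc's or the kernel's code.

#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic uint32_t flag;           /* the futex word */

static void waiter(void)
{
        /* Sleep only while the word is still 0; recheck after every wakeup. */
        while (atomic_load_explicit(&flag, memory_order_acquire) == 0)
                syscall(SYS_futex, &flag, FUTEX_WAIT, 0, NULL, NULL, 0);
}

static void waker(void)
{
        /* Publish first ... */
        atomic_store_explicit(&flag, 1, memory_order_release);
        /* ... then wake. The reverse order risks a missed wakeup, the same
         * class of bug mark_wake_futex() had here. */
        syscall(SYS_futex, &flag, FUTEX_WAKE, 1, NULL, NULL, 0);
}

int main(void)
{
        waker();        /* trivially ordered here; normally run in different tasks */
        waiter();
        return 0;
}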
diff --git a/queue-4.9/futex-handle-early-deadlock-return-correctly.patch b/queue-4.9/futex-handle-early-deadlock-return-correctly.patch
new file mode 100644 (file)
index 0000000..c5d088c
--- /dev/null
@@ -0,0 +1,231 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:20 +0200
+Subject: futex: Handle early deadlock return correctly
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqLAxAfaH+BDVy@decadent.org.uk>
+Content-Disposition: inline
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 1a1fb985f2e2b85ec0d3dc2e519ee48389ec2434 upstream.
+
+commit 56222b212e8e ("futex: Drop hb->lock before enqueueing on the
+rtmutex") changed the locking rules in the futex code so that the hash
+bucket lock is no longer held while the waiter is enqueued into the
+rtmutex wait list. This made the lock and the unlock path symmetric, but
+unfortunately the possible early exit from __rt_mutex_proxy_start() due to
+a detected deadlock was not updated accordingly. That allows a concurrent
+unlocker to observe inconsistent state which triggers the warning in the
+unlock path.
+
+futex_lock_pi()                         futex_unlock_pi()
+  lock(hb->lock)
+  queue(hb_waiter)                             lock(hb->lock)
+  lock(rtmutex->wait_lock)
+  unlock(hb->lock)
+                                        // acquired hb->lock
+                                        hb_waiter = futex_top_waiter()
+                                        lock(rtmutex->wait_lock)
+  __rt_mutex_proxy_start()
+     ---> fail
+          remove(rtmutex_waiter);
+     ---> returns -EDEADLOCK
+  unlock(rtmutex->wait_lock)
+                                        // acquired wait_lock
+                                        wake_futex_pi()
+                                        rt_mutex_next_owner()
+                                         --> returns NULL
+                                          --> WARN
+
+  lock(hb->lock)
+  unqueue(hb_waiter)
+
+The problem is caused by the remove(rtmutex_waiter) in the failure case of
+__rt_mutex_proxy_start() as this lets the unlocker observe a waiter in the
+hash bucket but no waiter on the rtmutex, i.e. inconsistent state.
+
+The original commit handles this correctly for the other early return cases
+(timeout, signal) by delaying the removal of the rtmutex waiter until the
+returning task reacquired the hash bucket lock.
+
+Treat the failure case of __rt_mutex_proxy_start() in the same way and let
+the existing cleanup code handle the eventual handover of the rtmutex
+gracefully. The regular rt_mutex_proxy_start() gains the rtmutex waiter
+removal for the failure case, so that the other callsites are still
+operating correctly.
+
+Add proper comments to the code so all these details are fully documented.
+
+Thanks to Peter for helping with the analysis and writing the really
+valuable code comments.
+
+Fixes: 56222b212e8e ("futex: Drop hb->lock before enqueueing on the rtmutex")
+Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Co-developed-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Cc: linux-s390@vger.kernel.org
+Cc: Stefan Liebler <stli@linux.ibm.com>
+Cc: Sebastian Sewior <bigeasy@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1901292311410.1950@nanos.tec.linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c           |   28 ++++++++++++++++++----------
+ kernel/locking/rtmutex.c |   37 ++++++++++++++++++++++++++++++++-----
+ 2 files changed, 50 insertions(+), 15 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2958,35 +2958,39 @@ retry_private:
+        * and BUG when futex_unlock_pi() interleaves with this.
+        *
+        * Therefore acquire wait_lock while holding hb->lock, but drop the
+-       * latter before calling rt_mutex_start_proxy_lock(). This still fully
+-       * serializes against futex_unlock_pi() as that does the exact same
+-       * lock handoff sequence.
++       * latter before calling __rt_mutex_start_proxy_lock(). This
++       * interleaves with futex_unlock_pi() -- which does a similar lock
++       * handoff -- such that the latter can observe the futex_q::pi_state
++       * before __rt_mutex_start_proxy_lock() is done.
+        */
+       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+       spin_unlock(q.lock_ptr);
++      /*
++       * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
++       * such that futex_unlock_pi() is guaranteed to observe the waiter when
++       * it sees the futex_q::pi_state.
++       */
+       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+       if (ret) {
+               if (ret == 1)
+                       ret = 0;
+-
+-              spin_lock(q.lock_ptr);
+-              goto no_block;
++              goto cleanup;
+       }
+-
+       if (unlikely(to))
+               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
++cleanup:
+       spin_lock(q.lock_ptr);
+       /*
+-       * If we failed to acquire the lock (signal/timeout), we must
++       * If we failed to acquire the lock (deadlock/signal/timeout), we must
+        * first acquire the hb->lock before removing the lock from the
+-       * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+-       * wait lists consistent.
++       * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
++       * lists consistent.
+        *
+        * In particular; it is important that futex_unlock_pi() can not
+        * observe this inconsistency.
+@@ -3093,6 +3097,10 @@ retry:
+                * there is no point where we hold neither; and therefore
+                * wake_futex_pi() must observe a state consistent with what we
+                * observed.
++               *
++               * In particular; this forces __rt_mutex_start_proxy() to
++               * complete such that we're guaranteed to observe the
++               * rt_waiter. Also see the WARN in wake_futex_pi().
+                */
+               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+               spin_unlock(&hb->lock);
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1695,12 +1695,33 @@ void rt_mutex_proxy_unlock(struct rt_mut
+       rt_mutex_set_owner(lock, NULL);
+ }
++/**
++ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
++ * @lock:             the rt_mutex to take
++ * @waiter:           the pre-initialized rt_mutex_waiter
++ * @task:             the task to prepare
++ *
++ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
++ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
++ *
++ * NOTE: does _NOT_ remove the @waiter on failure; must either call
++ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
++ *
++ * Returns:
++ *  0 - task blocked on lock
++ *  1 - acquired the lock for task, caller should wake it up
++ * <0 - error
++ *
++ * Special API call for PI-futex support.
++ */
+ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                             struct rt_mutex_waiter *waiter,
+                             struct task_struct *task)
+ {
+       int ret;
++      lockdep_assert_held(&lock->wait_lock);
++
+       if (try_to_take_rt_mutex(lock, task, NULL))
+               return 1;
+@@ -1718,9 +1739,6 @@ int __rt_mutex_start_proxy_lock(struct r
+               ret = 0;
+       }
+-      if (unlikely(ret))
+-              remove_waiter(lock, waiter);
+-
+       debug_rt_mutex_print_deadlock(waiter);
+       return ret;
+@@ -1732,12 +1750,18 @@ int __rt_mutex_start_proxy_lock(struct r
+  * @waiter:           the pre-initialized rt_mutex_waiter
+  * @task:             the task to prepare
+  *
++ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
++ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
++ *
++ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
++ * on failure.
++ *
+  * Returns:
+  *  0 - task blocked on lock
+  *  1 - acquired the lock for task, caller should wake it up
+  * <0 - error
+  *
+- * Special API call for FUTEX_REQUEUE_PI support.
++ * Special API call for PI-futex support.
+  */
+ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                             struct rt_mutex_waiter *waiter,
+@@ -1747,6 +1771,8 @@ int rt_mutex_start_proxy_lock(struct rt_
+       raw_spin_lock_irq(&lock->wait_lock);
+       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
++      if (unlikely(ret))
++              remove_waiter(lock, waiter);
+       raw_spin_unlock_irq(&lock->wait_lock);
+       return ret;
+@@ -1814,7 +1840,8 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+  * @lock:             the rt_mutex we were woken on
+  * @waiter:           the pre-initialized rt_mutex_waiter
+  *
+- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
++ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
++ * rt_mutex_wait_proxy_lock().
+  *
+  * Unless we acquired the lock; we're still enqueued on the wait-list and can
+  * in fact still be granted ownership until we're removed. Therefore we can
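The split documented above follows a common kernel convention that can be
sketched generically: a double-underscore helper that assumes the lock is held
and does not undo its enqueue on failure, and a plain wrapper that takes the
lock and performs that cleanup itself. The sketch below uses hypothetical names
and a toy failure condition; it is not the rtmutex code.

#include <pthread.h>
#include <stdbool.h>

struct waitq {
        pthread_mutex_t lock;
        int nr_waiters;
};

/* Caller holds q->lock. On failure the waiter stays enqueued; the caller
 * decides whether and when to remove it (cf. the NOTE above). */
static bool __start_wait(struct waitq *q)
{
        q->nr_waiters++;                /* enqueue */
        if (q->nr_waiters > 8)          /* toy stand-in for deadlock detection */
                return false;
        return true;
}

/* Caller holds q->lock. */
static void __cancel_wait(struct waitq *q)
{
        q->nr_waiters--;                /* dequeue */
}

/* Self-locking wrapper: it, not the inner helper, removes the waiter on failure. */
static bool start_wait(struct waitq *q)
{
        pthread_mutex_lock(&q->lock);
        bool ok = __start_wait(q);
        if (!ok)
                __cancel_wait(q);
        pthread_mutex_unlock(&q->lock);
        return ok;
}

int main(void)
{
        struct waitq q = { PTHREAD_MUTEX_INITIALIZER, 0 };
        return start_wait(&q) ? 0 : 1;
}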
diff --git a/queue-4.9/futex-handle-transient-ownerless-rtmutex-state-correctly.patch b/queue-4.9/futex-handle-transient-ownerless-rtmutex-state-correctly.patch
new file mode 100644 (file)
index 0000000..0bb854d
--- /dev/null
@@ -0,0 +1,85 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:43:15 +0200
+Subject: futex: Handle transient "ownerless" rtmutex state correctly
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqY9a/qbJKK5eC@decadent.org.uk>
+Content-Disposition: inline
+
+From: Mike Galbraith <efault@gmx.de>
+
+commit 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 upstream.
+
+Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
+This is one possible chain of events leading to this:
+
+Task Prio       Operation
+T1   120       lock(F)
+T2   120       lock(F)   -> blocks (top waiter)
+T3   50 (RT)   lock(F)   -> boosts T1 and blocks (new top waiter)
+XX             timeout/  -> wakes T2
+               signal
+T1   50                unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
+T2   120       cleanup   -> try_to_take_mutex() fails because T3 is the top waiter
+                            and the lower priority T2 cannot steal the lock.
+                         -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
+
+The comment states that this is invalid and rt_mutex_real_owner() must
+return a non NULL owner when the trylock failed, but in case of a queued
+and woken up waiter rt_mutex_real_owner() == NULL is a valid transient
+state. The higher priority waiter has simply not yet managed to take over
+the rtmutex.
+
+The BUG_ON() is therefore wrong and this is just another retry condition in
+fixup_pi_state_owner().
+
+Drop the locks, so that T3 can make progress, and then try the fixup again.
+
+Gratian provided a great analysis, traces and a reproducer. The analysis is
+to the point, but it confused the hell out of that tglx dude who had to
+page in all the futex horrors again. Condensed version is above.
+
+[ tglx: Wrote comment and changelog ]
+
+Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
+Reported-by: Gratian Crisan <gratian.crisan@ni.com>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com
+Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2497,10 +2497,22 @@ retry:
+               }
+               /*
+-               * Since we just failed the trylock; there must be an owner.
++               * The trylock just failed, so either there is an owner or
++               * there is a higher priority waiter than this one.
+                */
+               newowner = rt_mutex_owner(&pi_state->pi_mutex);
+-              BUG_ON(!newowner);
++              /*
++               * If the higher priority waiter has not yet taken over the
++               * rtmutex then newowner is NULL. We can't return here with
++               * that state because it's inconsistent vs. the user space
++               * state. So drop the locks and try again. It's a valid
++               * situation and not any different from the other retry
++               * conditions.
++               */
++              if (unlikely(!newowner)) {
++                      err = -EAGAIN;
++                      goto handle_err;
++              }
+       } else {
+               WARN_ON_ONCE(argowner != current);
+               if (oldowner == current) {
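For reference, the userspace construct that ends up in futex_lock_pi() and
fixup_pi_state_owner() is a priority-inheritance pthread mutex; a minimal
sketch follows. It only shows how such a mutex is created and locked; the
actual race needs an RT-priority third thread plus a timeout or signal, as in
the table above.

#include <pthread.h>
#include <stdio.h>

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_mutex_t m;

        pthread_mutexattr_init(&attr);
        /* PI mutexes are backed by FUTEX_LOCK_PI / FUTEX_UNLOCK_PI. */
        if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT)) {
                fprintf(stderr, "PTHREAD_PRIO_INHERIT not supported\n");
                return 1;
        }
        pthread_mutex_init(&m, &attr);

        pthread_mutex_lock(&m);         /* contended case: futex_lock_pi() */
        pthread_mutex_unlock(&m);       /* contended case: futex_unlock_pi() */

        pthread_mutex_destroy(&m);
        pthread_mutexattr_destroy(&attr);
        return 0;
}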
diff --git a/queue-4.9/futex-prevent-robust-futex-exit-race.patch b/queue-4.9/futex-prevent-robust-futex-exit-race.patch
new file mode 100644 (file)
index 0000000..6f9d81d
--- /dev/null
@@ -0,0 +1,266 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:58 +0200
+Subject: futex: Prevent robust futex exit race
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqUkDvp7aAhzSQ@decadent.org.uk>
+Content-Disposition: inline
+
+From: Yang Tao <yang.tao172@zte.com.cn>
+
+commit ca16d5bee59807bf04deaab0a8eccecd5061528c upstream.
+
+Robust futexes utilize the robust_list mechanism to allow the kernel to
+release futexes which are held when a task exits. The exit can be voluntary
+or caused by a signal or fault. This prevents waiters from blocking forever.
+
+The futex operations in user space store a pointer to the futex they are
+either locking or unlocking in the op_pending member of the per task robust
+list.
+
+After a lock operation has succeeded the futex is queued in the robust list
+linked list and the op_pending pointer is cleared.
+
+After an unlock operation has succeeded the futex is removed from the
+robust list linked list and the op_pending pointer is cleared.
+
+The robust list exit code checks for the pending operation and any futex
+which is queued in the linked list. It carefully checks whether the futex
+value is the TID of the exiting task. If so, it sets the OWNER_DIED bit and
+tries to wake up a potential waiter.
+
+This is race free for the lock operation but unlock has two race scenarios
+where waiters might not be woken up. These issues can be observed with
+regular robust pthread mutexes. PI aware pthread mutexes are not affected.
+
+(1) Unlocking task is killed after unlocking the futex value in user space
+    before being able to wake a waiter.
+
+        pthread_mutex_unlock()
+                |
+                V
+        atomic_exchange_rel (&mutex->__data.__lock, 0)
+                        <------------------------killed
+            lll_futex_wake ()                   |
+                                                |
+                                                |(__lock = 0)
+                                                |(enter kernel)
+                                                |
+                                                V
+                                            do_exit()
+                                            exit_mm()
+                                          mm_release()
+                                        exit_robust_list()
+                                        handle_futex_death()
+                                                |
+                                                |(__lock = 0)
+                                                |(uval = 0)
+                                                |
+                                                V
+        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+                return 0;
+
+    The sanity check which ensures that the user space futex is owned by
+    the exiting task prevents the wakeup of waiters which in consequence
+    block infinitely.
+
+(2) Waiting task is killed after a wakeup and before it can acquire the
+    futex in user space.
+
+        OWNER                         WAITER
+                               futex_wait()
+   pthread_mutex_unlock()               |
+                |                       |
+                |(__lock = 0)           |
+                |                       |
+                V                       |
+         futex_wake() ------------>  wakeup()
+                                        |
+                                        |(return to userspace)
+                                        |(__lock = 0)
+                                        |
+                                        V
+                        oldval = mutex->__data.__lock
+                                          <-----------------killed
+    atomic_compare_and_exchange_val_acq (&mutex->__data.__lock,  |
+                        id | assume_other_futex_waiters, 0)      |
+                                                                 |
+                                                                 |
+                                                   (enter kernel)|
+                                                                 |
+                                                                 V
+                                                         do_exit()
+                                                        |
+                                                        |
+                                                        V
+                                        handle_futex_death()
+                                        |
+                                        |(__lock = 0)
+                                        |(uval = 0)
+                                        |
+                                        V
+        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+                return 0;
+
+    The sanity check which ensures that the user space futex is owned
+    by the exiting task prevents the wakeup of waiters, which seems to
+    be correct as the exiting task does not own the futex value, but
+    the consequence is that other waiters won't be woken up and block
+    infinitely.
+
+In both scenarios the following conditions are true:
+
+   - task->robust_list->list_op_pending != NULL
+   - user space futex value == 0
+   - Regular futex (not PI)
+
+If these conditions are met then it is reasonably safe to wake up a
+potential waiter in order to prevent the above problems.
+
+As this might be a false positive it can cause spurious wakeups, but the
+waiter side has to handle other types of unrelated wakeups, e.g. signals
+gracefully anyway. So such a spurious wakeup will not affect the
+correctness of these operations.
+
+This workaround must not touch the user space futex value and cannot set
+the OWNER_DIED bit because the lock value is 0, i.e. uncontended. Setting
+OWNER_DIED in this case would result in inconsistent state and subsequently
+in malfunction of the owner died handling in user space.
+
+The rest of the user space state is still consistent as no other task can
+observe the list_op_pending entry in the exiting tasks robust list.
+
+The eventually woken up waiter will observe the uncontended lock value and
+take it over.
+
+[ tglx: Massaged changelog and comment. Made the return explicit and not
+       depend on the subsequent check and added constants to hand into
+       handle_futex_death() instead of plain numbers. Fixed a few coding
+       style issues. ]
+
+Fixes: 0771dfefc9e5 ("[PATCH] lightweight robust futexes: core")
+Signed-off-by: Yang Tao <yang.tao172@zte.com.cn>
+Signed-off-by: Yi Wang <wang.yi59@zte.com.cn>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ingo Molnar <mingo@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: stable@vger.kernel.org
+Link: https://lkml.kernel.org/r/1573010582-35297-1-git-send-email-wang.yi59@zte.com.cn
+Link: https://lkml.kernel.org/r/20191106224555.943191378@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   58 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 51 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3526,11 +3526,16 @@ err_unlock:
+       return ret;
+ }
++/* Constants for the pending_op argument of handle_futex_death */
++#define HANDLE_DEATH_PENDING  true
++#define HANDLE_DEATH_LIST     false
++
+ /*
+  * Process a futex-list entry, check whether it's owned by the
+  * dying task, and do notification if so:
+  */
+-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
++static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
++                            bool pi, bool pending_op)
+ {
+       u32 uval, uninitialized_var(nval), mval;
+       int err;
+@@ -3543,6 +3548,42 @@ retry:
+       if (get_user(uval, uaddr))
+               return -1;
++      /*
++       * Special case for regular (non PI) futexes. The unlock path in
++       * user space has two race scenarios:
++       *
++       * 1. The unlock path releases the user space futex value and
++       *    before it can execute the futex() syscall to wake up
++       *    waiters it is killed.
++       *
++       * 2. A woken up waiter is killed before it can acquire the
++       *    futex in user space.
++       *
++       * In both cases the TID validation below prevents a wakeup of
++       * potential waiters which can cause these waiters to block
++       * forever.
++       *
++       * In both cases the following conditions are met:
++       *
++       *      1) task->robust_list->list_op_pending != NULL
++       *         @pending_op == true
++       *      2) User space futex value == 0
++       *      3) Regular futex: @pi == false
++       *
++       * If these conditions are met, it is safe to attempt waking up a
++       * potential waiter without touching the user space futex value and
++       * trying to set the OWNER_DIED bit. The user space futex value is
++       * uncontended and the rest of the user space mutex state is
++       * consistent, so a woken waiter will just take over the
++       * uncontended futex. Setting the OWNER_DIED bit would create
++       * inconsistent state and malfunction of the user space owner died
++       * handling.
++       */
++      if (pending_op && !pi && !uval) {
++              futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++              return 0;
++      }
++
+       if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+               return 0;
+@@ -3662,10 +3703,11 @@ static void exit_robust_list(struct task
+                * A pending lock might already be on the list, so
+                * don't process it twice:
+                */
+-              if (entry != pending)
++              if (entry != pending) {
+                       if (handle_futex_death((void __user *)entry + futex_offset,
+-                                              curr, pi))
++                                              curr, pi, HANDLE_DEATH_LIST))
+                               return;
++              }
+               if (rc)
+                       return;
+               entry = next_entry;
+@@ -3679,9 +3721,10 @@ static void exit_robust_list(struct task
+               cond_resched();
+       }
+-      if (pending)
++      if (pending) {
+               handle_futex_death((void __user *)pending + futex_offset,
+-                                 curr, pip);
++                                 curr, pip, HANDLE_DEATH_PENDING);
++      }
+ }
+ static void futex_cleanup(struct task_struct *tsk)
+@@ -3964,7 +4007,8 @@ void compat_exit_robust_list(struct task
+               if (entry != pending) {
+                       void __user *uaddr = futex_uaddr(entry, futex_offset);
+-                      if (handle_futex_death(uaddr, curr, pi))
++                      if (handle_futex_death(uaddr, curr, pi,
++                                             HANDLE_DEATH_LIST))
+                               return;
+               }
+               if (rc)
+@@ -3983,7 +4027,7 @@ void compat_exit_robust_list(struct task
+       if (pending) {
+               void __user *uaddr = futex_uaddr(pending, futex_offset);
+-              handle_futex_death(uaddr, curr, pip);
++              handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+       }
+ }
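A minimal userspace sketch of the robust (non-PI) mutex case this change is
about: the owner dies while holding the lock, the kernel walks its robust list
on exit, and the next locker is told EOWNERDEAD instead of blocking forever.
The shared-memory setup and error handling are pared down and illustrative only.

#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_mutex_t *m = mmap(NULL, sizeof(*m), PROT_READ | PROT_WRITE,
                                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
        pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
        pthread_mutex_init(m, &attr);

        if (fork() == 0) {
                pthread_mutex_lock(m);  /* child exits while holding the lock */
                _exit(0);               /* exit_robust_list() -> handle_futex_death() */
        }
        wait(NULL);

        if (pthread_mutex_lock(m) == EOWNERDEAD) {
                printf("previous owner died, recovering\n");
                pthread_mutex_consistent(m);    /* repair the state, then use it normally */
        }
        pthread_mutex_unlock(m);
        return 0;
}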
diff --git a/queue-4.9/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch b/queue-4.9/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
new file mode 100644 (file)
index 0000000..dc557f5
--- /dev/null
@@ -0,0 +1,273 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:41:42 +0200
+Subject: futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqBnDrUInZoDsM@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit cfafcd117da0216520568c195cb2f6cd1980c4bb upstream.
+
+By changing futex_lock_pi() to use rt_mutex_*_proxy_lock() all wait_list
+modifications are done under both hb->lock and wait_lock.
+
+This closes the obvious interleave pattern between futex_lock_pi() and
+futex_unlock_pi(), but not entirely so. See below:
+
+Before:
+
+futex_lock_pi()                        futex_unlock_pi()
+  unlock hb->lock
+
+                                 lock hb->lock
+                                 unlock hb->lock
+
+                                 lock rt_mutex->wait_lock
+                                 unlock rt_mutex_wait_lock
+                                   -EAGAIN
+
+  lock rt_mutex->wait_lock
+  list_add
+  unlock rt_mutex->wait_lock
+
+  schedule()
+
+  lock rt_mutex->wait_lock
+  list_del
+  unlock rt_mutex->wait_lock
+
+                                 <idem>
+                                   -EAGAIN
+
+  lock hb->lock
+
+After:
+
+futex_lock_pi()                        futex_unlock_pi()
+
+  lock hb->lock
+  lock rt_mutex->wait_lock
+  list_add
+  unlock rt_mutex->wait_lock
+  unlock hb->lock
+
+  schedule()
+                                 lock hb->lock
+                                 unlock hb->lock
+  lock hb->lock
+  lock rt_mutex->wait_lock
+  list_del
+  unlock rt_mutex->wait_lock
+
+                                 lock rt_mutex->wait_lock
+                                 unlock rt_mutex_wait_lock
+                                   -EAGAIN
+
+  unlock hb->lock
+
+It does however solve the earlier starvation/live-lock scenario which got
+introduced with the -EAGAIN since unlike the before scenario; where the
+-EAGAIN happens while futex_unlock_pi() doesn't hold any locks; in the
+after scenario it happens while futex_unlock_pi() actually holds a lock,
+and then it is serialized on that lock.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c                  |   77 ++++++++++++++++++++++++++++------------
+ kernel/locking/rtmutex.c        |   26 +++----------
+ kernel/locking/rtmutex_common.h |    1 
+ 3 files changed, 62 insertions(+), 42 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2333,20 +2333,7 @@ queue_unlock(struct futex_hash_bucket *h
+       hb_waiters_dec(hb);
+ }
+-/**
+- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+- * @q:        The futex_q to enqueue
+- * @hb:       The destination hash bucket
+- *
+- * The hb->lock must be held by the caller, and is released here. A call to
+- * queue_me() is typically paired with exactly one call to unqueue_me().  The
+- * exceptions involve the PI related operations, which may use unqueue_me_pi()
+- * or nothing if the unqueue is done as part of the wake process and the unqueue
+- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+- * an example).
+- */
+-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+-      __releases(&hb->lock)
++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ {
+       int prio;
+@@ -2363,6 +2350,24 @@ static inline void queue_me(struct futex
+       plist_node_init(&q->list, prio);
+       plist_add(&q->list, &hb->chain);
+       q->task = current;
++}
++
++/**
++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
++ * @q:        The futex_q to enqueue
++ * @hb:       The destination hash bucket
++ *
++ * The hb->lock must be held by the caller, and is released here. A call to
++ * queue_me() is typically paired with exactly one call to unqueue_me().  The
++ * exceptions involve the PI related operations, which may use unqueue_me_pi()
++ * or nothing if the unqueue is done as part of the wake process and the unqueue
++ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
++ * an example).
++ */
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++      __releases(&hb->lock)
++{
++      __queue_me(q, hb);
+       spin_unlock(&hb->lock);
+ }
+@@ -2868,6 +2873,7 @@ static int futex_lock_pi(u32 __user *uad
+ {
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct task_struct *exiting = NULL;
++      struct rt_mutex_waiter rt_waiter;
+       struct futex_hash_bucket *hb;
+       struct futex_q q = futex_q_init;
+       int res, ret;
+@@ -2928,25 +2934,52 @@ retry_private:
+               }
+       }
++      WARN_ON(!q.pi_state);
++
+       /*
+        * Only actually queue now that the atomic ops are done:
+        */
+-      queue_me(&q, hb);
++      __queue_me(&q, hb);
+-      WARN_ON(!q.pi_state);
+-      /*
+-       * Block on the PI mutex:
+-       */
+-      if (!trylock) {
+-              ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+-      } else {
++      if (trylock) {
+               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+               /* Fixup the trylock return value: */
+               ret = ret ? 0 : -EWOULDBLOCK;
++              goto no_block;
+       }
++      /*
++       * We must add ourselves to the rt_mutex waitlist while holding hb->lock
++       * such that the hb and rt_mutex wait lists match.
++       */
++      rt_mutex_init_waiter(&rt_waiter);
++      ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++      if (ret) {
++              if (ret == 1)
++                      ret = 0;
++
++              goto no_block;
++      }
++
++      spin_unlock(q.lock_ptr);
++
++      if (unlikely(to))
++              hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
++
++      ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
++
+       spin_lock(q.lock_ptr);
+       /*
++       * If we failed to acquire the lock (signal/timeout), we must
++       * first acquire the hb->lock before removing the lock from the
++       * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
++       * wait lists consistent.
++       */
++      if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
++              ret = 0;
++
++no_block:
++      /*
+        * Fixup the pi_state owner and possibly acquire the lock if we
+        * haven't already.
+        */
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1523,19 +1523,6 @@ int __sched rt_mutex_lock_interruptible(
+ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+ /*
+- * Futex variant with full deadlock detection.
+- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
+- */
+-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+-                            struct hrtimer_sleeper *timeout)
+-{
+-      might_sleep();
+-
+-      return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
+-                               timeout, RT_MUTEX_FULL_CHAINWALK);
+-}
+-
+-/*
+  * Futex variant, must not use fastpath.
+  */
+ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+@@ -1808,12 +1795,6 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+       /* sleep on the mutex */
+       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+-      /*
+-       * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+-       * have to fix that up.
+-       */
+-      fixup_rt_mutex_waiters(lock);
+-
+       raw_spin_unlock_irq(&lock->wait_lock);
+       return ret;
+@@ -1853,6 +1834,13 @@ bool rt_mutex_cleanup_proxy_lock(struct
+               fixup_rt_mutex_waiters(lock);
+               cleanup = true;
+       }
++
++      /*
++       * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++       * have to fix that up.
++       */
++      fixup_rt_mutex_waiters(lock);
++
+       raw_spin_unlock_irq(&lock->wait_lock);
+       return cleanup;
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -112,7 +112,6 @@ extern int rt_mutex_wait_proxy_lock(stru
+                              struct rt_mutex_waiter *waiter);
+ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+                                struct rt_mutex_waiter *waiter);
+-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+ extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+ extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
diff --git a/queue-4.9/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch b/queue-4.9/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
new file mode 100644 (file)
index 0000000..beac03a
--- /dev/null
@@ -0,0 +1,135 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:08 +0200
+Subject: futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqIHbfbs/FszkQ@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 04dc1b2fff4e96cb4142227fbdc63c8871ad4ed9 upstream.
+
+Markus reported that the glibc/nptl/tst-robustpi8 test was failing after
+commit:
+
+  cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+
+The following trace shows the problem:
+
+ ld-linux-x86-64-2161  [019] ....   410.760971: SyS_futex: 00007ffbeb76b028: 80000875  op=FUTEX_LOCK_PI
+ ld-linux-x86-64-2161  [019] ...1   410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0
+ ld-linux-x86-64-2165  [011] ....   410.760978: SyS_futex: 00007ffbeb76b028: 80000875  op=FUTEX_UNLOCK_PI
+ ld-linux-x86-64-2165  [011] d..1   410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0
+ ld-linux-x86-64-2165  [011] ....   410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000
+ ld-linux-x86-64-2161  [019] ....   410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT
+
+Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161
+which then returns with -ETIMEDOUT. That wrecks the lock state, because now
+the owner isn't aware it acquired the lock and removes the pending robust
+list entry.
+
+If 2161 is killed, the robust list will not clear out this futex and the
+subsequent acquire on this futex will then (correctly) result in -ESRCH,
+which glibc does not expect; it triggers an internal assertion and dies.
+
+Task 2161                      Task 2165
+
+rt_mutex_wait_proxy_lock()
+   timeout();
+   /* T2161 is still queued in  the waiter list */
+   return -ETIMEDOUT;
+
+                               futex_unlock_pi()
+                               spin_lock(hb->lock);
+                               rtmutex_unlock()
+                                 remove_rtmutex_waiter(T2161);
+                                  mark_lock_available();
+                               /* Make the next waiter owner of the user space side */
+                               futex_uval = 2161;
+                               spin_unlock(hb->lock);
+spin_lock(hb->lock);
+rt_mutex_cleanup_proxy_lock()
+  if (rtmutex_owner() != current)
+     ...
+     return FAIL;
+....
+return -ETIMEDOUT;
+
+This means that rt_mutex_cleanup_proxy_lock() needs to call
+try_to_take_rt_mutex() so it can correctly take over the rtmutex that was
+assigned to it by the waker. If the rtmutex is owned by some other task
+then this call is harmless and just confirms that the waiter is not able
+to acquire it.
+
+While there, fix what looks like a merge error which resulted in
+rt_mutex_cleanup_proxy_lock() having two calls to
+fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any.
+Both should have one, since both potentially touch the waiter list.
+
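+A simplified sketch (not the verbatim kernel code) of the resulting
+cleanup path:
+
+  raw_spin_lock_irq(&lock->wait_lock);
+  /* Unconditional try-lock: deals with the ownerless state left behind
+     by mark_wakeup_next_waiter(). */
+  try_to_take_rt_mutex(lock, current, waiter);
+  if (rt_mutex_owner(lock) != current) {
+          /* We did not become owner; dequeue ourselves. */
+          remove_waiter(lock, waiter);
+          cleanup = true;
+  }
+  /* The try-lock sets the waiter bit unconditionally; fix that up. */
+  fixup_rt_mutex_waiters(lock);
+  raw_spin_unlock_irq(&lock->wait_lock);
+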
+Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()")
+Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
+Bug-Spotted-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Florian Weimer <fweimer@redhat.com>
+Cc: Darren Hart <dvhart@infradead.org>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
+Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/locking/rtmutex.c |   24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1796,12 +1796,14 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+       int ret;
+       raw_spin_lock_irq(&lock->wait_lock);
+-
+-      set_current_state(TASK_INTERRUPTIBLE);
+-
+       /* sleep on the mutex */
++      set_current_state(TASK_INTERRUPTIBLE);
+       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+-
++      /*
++       * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++       * have to fix that up.
++       */
++      fixup_rt_mutex_waiters(lock);
+       raw_spin_unlock_irq(&lock->wait_lock);
+       return ret;
+@@ -1833,15 +1835,25 @@ bool rt_mutex_cleanup_proxy_lock(struct
+       raw_spin_lock_irq(&lock->wait_lock);
+       /*
++       * Do an unconditional try-lock, this deals with the lock stealing
++       * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
++       * sets a NULL owner.
++       *
++       * We're not interested in the return value, because the subsequent
++       * test on rt_mutex_owner() will infer that. If the trylock succeeded,
++       * we will own the lock and it will have removed the waiter. If we
++       * failed the trylock, we're still not owner and we need to remove
++       * ourselves.
++       */
++      try_to_take_rt_mutex(lock, current, waiter);
++      /*
+        * Unless we're the owner; we're still enqueued on the wait_list.
+        * So check if we became owner, if not, take us off the wait_list.
+        */
+       if (rt_mutex_owner(lock) != current) {
+               remove_waiter(lock, waiter);
+-              fixup_rt_mutex_waiters(lock);
+               cleanup = true;
+       }
+-
+       /*
+        * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+        * have to fix that up.
diff --git a/queue-4.9/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch b/queue-4.9/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
new file mode 100644 (file)
index 0000000..427c051
--- /dev/null
@@ -0,0 +1,88 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:41:33 +0200
+Subject: futex,rt_mutex: Introduce rt_mutex_init_waiter()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDp/WL8DllYuErc@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 50809358dd7199aa7ce232f6877dd09ec30ef374 upstream.
+
+Since there are already two copies of this code, introduce a helper now
+before adding a third one.
+
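+For illustration only, the pattern at a call site then becomes (sketch,
+mirroring how the futex code uses the helper):
+
+  struct rt_mutex_waiter rt_waiter;
+
+  /* replaces the open-coded debug_rt_mutex_init_waiter()/RB_CLEAR_NODE() */
+  rt_mutex_init_waiter(&rt_waiter);
+  ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+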
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c                  |    5 +----
+ kernel/locking/rtmutex.c        |   12 +++++++++---
+ kernel/locking/rtmutex_common.h |    1 +
+ 3 files changed, 11 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3234,10 +3234,7 @@ static int futex_wait_requeue_pi(u32 __u
+        * The waiter is allocated on our stack, manipulated by the requeue
+        * code while we sleep on uaddr.
+        */
+-      debug_rt_mutex_init_waiter(&rt_waiter);
+-      RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+-      RB_CLEAR_NODE(&rt_waiter.tree_entry);
+-      rt_waiter.task = NULL;
++      rt_mutex_init_waiter(&rt_waiter);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+       if (unlikely(ret != 0))
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1176,6 +1176,14 @@ void rt_mutex_adjust_pi(struct task_stru
+                                  next_lock, NULL, task);
+ }
++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
++{
++      debug_rt_mutex_init_waiter(waiter);
++      RB_CLEAR_NODE(&waiter->pi_tree_entry);
++      RB_CLEAR_NODE(&waiter->tree_entry);
++      waiter->task = NULL;
++}
++
+ /**
+  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+  * @lock:              the rt_mutex to take
+@@ -1258,9 +1266,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+       unsigned long flags;
+       int ret = 0;
+-      debug_rt_mutex_init_waiter(&waiter);
+-      RB_CLEAR_NODE(&waiter.pi_tree_entry);
+-      RB_CLEAR_NODE(&waiter.tree_entry);
++      rt_mutex_init_waiter(&waiter);
+       /*
+        * Technically we could use raw_spin_[un]lock_irq() here, but this can
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -103,6 +103,7 @@ extern struct task_struct *rt_mutex_next
+ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+                                      struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                    struct rt_mutex_waiter *waiter,
+                                    struct task_struct *task);
diff --git a/queue-4.9/futex-use-smp_store_release-in-mark_wake_futex.patch b/queue-4.9/futex-use-smp_store_release-in-mark_wake_futex.patch
new file mode 100644 (file)
index 0000000..246f3aa
--- /dev/null
@@ -0,0 +1,46 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:40:54 +0200
+Subject: futex: Use smp_store_release() in mark_wake_futex()
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDp1qJOCUJmE1Ty@decadent.org.uk>
+Content-Disposition: inline
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 1b367ece0d7e696cab1c8501bab282cc6a538b3f upstream.
+
+Since the futex_q can disappear the instruction after assigning NULL,
+this really should be a RELEASE barrier. That stops loads from hitting
+dead memory too.
+
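+In other words (simplified sketch, not the verbatim kernel code):
+
+  /* before: smp_wmb() only orders the earlier plist_del() stores */
+  smp_wmb();
+  q->lock_ptr = NULL;
+
+  /* after: RELEASE semantics also keep earlier loads from being
+     reordered past the store that allows the futex_q to be freed */
+  smp_store_release(&q->lock_ptr, NULL);
+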
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.604296452@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1565,8 +1565,7 @@ static void mark_wake_futex(struct wake_
+        * memory barrier is required here to prevent the following
+        * store to lock_ptr from getting ahead of the plist_del.
+        */
+-      smp_wmb();
+-      q->lock_ptr = NULL;
++      smp_store_release(&q->lock_ptr, NULL);
+ }
+ /*
diff --git a/queue-4.9/locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch b/queue-4.9/locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch
new file mode 100644 (file)
index 0000000..dd2c3e0
--- /dev/null
@@ -0,0 +1,337 @@
+From foo@baz Mon Mar 29 07:48:09 AM CEST 2021
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 28 Mar 2021 22:42:44 +0200
+Subject: locking/futex: Allow low-level atomic operations to return -EAGAIN
+To: stable@vger.kernel.org
+Cc: Lee Jones <lee.jones@linaro.org>, "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>, Florian Fainelli <f.fainelli@gmail.com>
+Message-ID: <YGDqRGR3K1fUDkoR@decadent.org.uk>
+Content-Disposition: inline
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 6b4f4bc9cb22875f97023984a625386f0c7cc1c0 upstream.
+
+Some futex() operations, including FUTEX_WAKE_OP, require the kernel to
+perform an atomic read-modify-write of the futex word via the userspace
+mapping. These operations are implemented by each architecture in
+arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
+are called in atomic context with the relevant hash bucket locks held.
+
+Although these routines may return -EFAULT in response to a page fault
+generated when accessing userspace, they are expected to succeed (i.e.
+return 0) in all other cases. This poses a problem for architectures
+that do not provide bounded forward progress guarantees or fairness of
+contended atomic operations and can lead to starvation in some cases.
+
+In these problematic scenarios, we must return back to the core futex
+code so that we can drop the hash bucket locks and reschedule if
+necessary, much like we do in the case of a page fault.
+
+Allow architectures to return -EAGAIN from their implementations of
+arch_futex_atomic_op_inuser() and futex_atomic_cmpxchg_inatomic(), which
+will cause the core futex code to reschedule if necessary and return
+back to the architecture code later on.
+
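+The core futex callers then handle the new error value along these lines
+(simplified sketch of the pattern used in the diff below):
+
+  err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+  switch (err) {
+  case 0:
+          break;
+  case -EFAULT:
+          /* drop the hb locks, fault the page in and retry */
+          err = fault_in_user_writeable(uaddr);
+          break;
+  case -EAGAIN:
+          /* drop the hb locks, reschedule and retry */
+          cond_resched();
+          err = 0;
+          break;
+  default:
+          WARN_ON_ONCE(1);
+          break;
+  }
+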
+Cc: <stable@kernel.org>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+[bwh: Backported to 4.9: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |  185 +++++++++++++++++++++++++++++++++++----------------------
+ 1 file changed, 115 insertions(+), 70 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1407,13 +1407,15 @@ static int lookup_pi_state(u32 __user *u
+ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+ {
++      int err;
+       u32 uninitialized_var(curval);
+       if (unlikely(should_fail_futex(true)))
+               return -EFAULT;
+-      if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+-              return -EFAULT;
++      err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++      if (unlikely(err))
++              return err;
+       /* If user space value changed, let the caller retry */
+       return curval != uval ? -EAGAIN : 0;
+@@ -1606,10 +1608,8 @@ static int wake_futex_pi(u32 __user *uad
+       if (unlikely(should_fail_futex(true)))
+               ret = -EFAULT;
+-      if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
+-              ret = -EFAULT;
+-
+-      } else if (curval != uval) {
++      ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++      if (!ret && (curval != uval)) {
+               /*
+                * If a unconditional UNLOCK_PI operation (user space did not
+                * try the TID->0 transition) raced with a waiter setting the
+@@ -1795,32 +1795,32 @@ retry_private:
+       double_lock_hb(hb1, hb2);
+       op_ret = futex_atomic_op_inuser(op, uaddr2);
+       if (unlikely(op_ret < 0)) {
+-
+               double_unlock_hb(hb1, hb2);
+-#ifndef CONFIG_MMU
+-              /*
+-               * we don't get EFAULT from MMU faults if we don't have an MMU,
+-               * but we might get them from range checking
+-               */
+-              ret = op_ret;
+-              goto out_put_keys;
+-#endif
+-
+-              if (unlikely(op_ret != -EFAULT)) {
++              if (!IS_ENABLED(CONFIG_MMU) ||
++                  unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
++                      /*
++                       * we don't get EFAULT from MMU faults if we don't have
++                       * an MMU, but we might get them from range checking
++                       */
+                       ret = op_ret;
+                       goto out_put_keys;
+               }
+-              ret = fault_in_user_writeable(uaddr2);
+-              if (ret)
+-                      goto out_put_keys;
++              if (op_ret == -EFAULT) {
++                      ret = fault_in_user_writeable(uaddr2);
++                      if (ret)
++                              goto out_put_keys;
++              }
+-              if (!(flags & FLAGS_SHARED))
++              if (!(flags & FLAGS_SHARED)) {
++                      cond_resched();
+                       goto retry_private;
++              }
+               put_futex_key(&key2);
+               put_futex_key(&key1);
++              cond_resched();
+               goto retry;
+       }
+@@ -2516,14 +2516,17 @@ retry:
+       if (!pi_state->owner)
+               newtid |= FUTEX_OWNER_DIED;
+-      if (get_futex_value_locked(&uval, uaddr))
+-              goto handle_fault;
++      err = get_futex_value_locked(&uval, uaddr);
++      if (err)
++              goto handle_err;
+       for (;;) {
+               newval = (uval & FUTEX_OWNER_DIED) | newtid;
+-              if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+-                      goto handle_fault;
++              err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
++              if (err)
++                      goto handle_err;
++
+               if (curval == uval)
+                       break;
+               uval = curval;
+@@ -2538,23 +2541,36 @@ retry:
+       return argowner == current;
+       /*
+-       * To handle the page fault we need to drop the locks here. That gives
+-       * the other task (either the highest priority waiter itself or the
+-       * task which stole the rtmutex) the chance to try the fixup of the
+-       * pi_state. So once we are back from handling the fault we need to
+-       * check the pi_state after reacquiring the locks and before trying to
+-       * do another fixup. When the fixup has been done already we simply
+-       * return.
++       * In order to reschedule or handle a page fault, we need to drop the
++       * locks here. In the case of a fault, this gives the other task
++       * (either the highest priority waiter itself or the task which stole
++       * the rtmutex) the chance to try the fixup of the pi_state. So once we
++       * are back from handling the fault we need to check the pi_state after
++       * reacquiring the locks and before trying to do another fixup. When
++       * the fixup has been done already we simply return.
+        *
+        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+        * drop hb->lock since the caller owns the hb -> futex_q relation.
+        * Dropping the pi_mutex->wait_lock requires the state revalidate.
+        */
+-handle_fault:
++handle_err:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       spin_unlock(q->lock_ptr);
+-      err = fault_in_user_writeable(uaddr);
++      switch (err) {
++      case -EFAULT:
++              err = fault_in_user_writeable(uaddr);
++              break;
++
++      case -EAGAIN:
++              cond_resched();
++              err = 0;
++              break;
++
++      default:
++              WARN_ON_ONCE(1);
++              break;
++      }
+       spin_lock(q->lock_ptr);
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+@@ -3128,10 +3144,8 @@ retry:
+                * A unconditional UNLOCK_PI op raced against a waiter
+                * setting the FUTEX_WAITERS bit. Try again.
+                */
+-              if (ret == -EAGAIN) {
+-                      put_futex_key(&key);
+-                      goto retry;
+-              }
++              if (ret == -EAGAIN)
++                      goto pi_retry;
+               /*
+                * wake_futex_pi has detected invalid state. Tell user
+                * space.
+@@ -3146,9 +3160,19 @@ retry:
+        * preserve the WAITERS bit not the OWNER_DIED one. We are the
+        * owner.
+        */
+-      if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
++      if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
+               spin_unlock(&hb->lock);
+-              goto pi_faulted;
++              switch (ret) {
++              case -EFAULT:
++                      goto pi_faulted;
++
++              case -EAGAIN:
++                      goto pi_retry;
++
++              default:
++                      WARN_ON_ONCE(1);
++                      goto out_putkey;
++              }
+       }
+       /*
+@@ -3162,6 +3186,11 @@ out_putkey:
+       put_futex_key(&key);
+       return ret;
++pi_retry:
++      put_futex_key(&key);
++      cond_resched();
++      goto retry;
++
+ pi_faulted:
+       put_futex_key(&key);
+@@ -3504,6 +3533,7 @@ err_unlock:
+ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+ {
+       u32 uval, uninitialized_var(nval), mval;
++      int err;
+       /* Futex address must be 32bit aligned */
+       if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+@@ -3513,42 +3543,57 @@ retry:
+       if (get_user(uval, uaddr))
+               return -1;
+-      if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
+-              /*
+-               * Ok, this dying thread is truly holding a futex
+-               * of interest. Set the OWNER_DIED bit atomically
+-               * via cmpxchg, and if the value had FUTEX_WAITERS
+-               * set, wake up a waiter (if any). (We have to do a
+-               * futex_wake() even if OWNER_DIED is already set -
+-               * to handle the rare but possible case of recursive
+-               * thread-death.) The rest of the cleanup is done in
+-               * userspace.
+-               */
+-              mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+-              /*
+-               * We are not holding a lock here, but we want to have
+-               * the pagefault_disable/enable() protection because
+-               * we want to handle the fault gracefully. If the
+-               * access fails we try to fault in the futex with R/W
+-               * verification via get_user_pages. get_user() above
+-               * does not guarantee R/W access. If that fails we
+-               * give up and leave the futex locked.
+-               */
+-              if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
++      if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
++              return 0;
++
++      /*
++       * Ok, this dying thread is truly holding a futex
++       * of interest. Set the OWNER_DIED bit atomically
++       * via cmpxchg, and if the value had FUTEX_WAITERS
++       * set, wake up a waiter (if any). (We have to do a
++       * futex_wake() even if OWNER_DIED is already set -
++       * to handle the rare but possible case of recursive
++       * thread-death.) The rest of the cleanup is done in
++       * userspace.
++       */
++      mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
++
++      /*
++       * We are not holding a lock here, but we want to have
++       * the pagefault_disable/enable() protection because
++       * we want to handle the fault gracefully. If the
++       * access fails we try to fault in the futex with R/W
++       * verification via get_user_pages. get_user() above
++       * does not guarantee R/W access. If that fails we
++       * give up and leave the futex locked.
++       */
++      if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
++              switch (err) {
++              case -EFAULT:
+                       if (fault_in_user_writeable(uaddr))
+                               return -1;
+                       goto retry;
+-              }
+-              if (nval != uval)
++
++              case -EAGAIN:
++                      cond_resched();
+                       goto retry;
+-              /*
+-               * Wake robust non-PI futexes here. The wakeup of
+-               * PI futexes happens in exit_pi_state():
+-               */
+-              if (!pi && (uval & FUTEX_WAITERS))
+-                      futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++              default:
++                      WARN_ON_ONCE(1);
++                      return err;
++              }
+       }
++
++      if (nval != uval)
++              goto retry;
++
++      /*
++       * Wake robust non-PI futexes here. The wakeup of
++       * PI futexes happens in exit_pi_state():
++       */
++      if (!pi && (uval & FUTEX_WAITERS))
++              futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
++
+       return 0;
+ }
index 82aa6ab8daed04fae2577282e035fde596f13791..240edd61d6e8f3f7a9802384abeb63a7b6ebb6bb 100644 (file)
@@ -34,3 +34,16 @@ acpi-scan-rearrange-memory-allocation-in-acpi_device.patch
 acpi-scan-use-unique-number-for-instance_no.patch
 perf-auxtrace-fix-auxtrace-queue-conflict.patch
 idr-add-ida_is_empty.patch
+futex-use-smp_store_release-in-mark_wake_futex.patch
+futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
+futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
+futex-drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
+futex-avoid-freeing-an-active-timer.patch
+futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
+futex-handle-early-deadlock-return-correctly.patch
+futex-fix-possible-missed-wakeup.patch
+locking-futex-allow-low-level-atomic-operations-to-return-eagain.patch
+arm64-futex-bound-number-of-ldxr-stxr-loops-in-futex_wake_op.patch
+futex-prevent-robust-futex-exit-race.patch
+futex-fix-incorrect-should_fail_futex-handling.patch
+futex-handle-transient-ownerless-rtmutex-state-correctly.patch