git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 8 Aug 2021 06:46:11 +0000 (08:46 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 8 Aug 2021 06:46:11 +0000 (08:46 +0200)
added patches:
futex-avoid-freeing-an-active-timer.patch
futex-cleanup-refcounting.patch
futex-futex_unlock_pi-determinism.patch
futex-handle-transient-ownerless-rtmutex-state-correctly.patch
futex-pull-rt_mutex_futex_unlock-out-from-under-hb-lock.patch
futex-rename-free_pi_state-to-put_pi_state.patch
futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
rcu-update-documentation-of-rcu_read_unlock.patch
rtmutex-make-wait_lock-irq-safe.patch

12 files changed:
queue-4.4/futex-avoid-freeing-an-active-timer.patch [new file with mode: 0644]
queue-4.4/futex-cleanup-refcounting.patch [new file with mode: 0644]
queue-4.4/futex-futex_unlock_pi-determinism.patch [new file with mode: 0644]
queue-4.4/futex-handle-transient-ownerless-rtmutex-state-correctly.patch [new file with mode: 0644]
queue-4.4/futex-pull-rt_mutex_futex_unlock-out-from-under-hb-lock.patch [new file with mode: 0644]
queue-4.4/futex-rename-free_pi_state-to-put_pi_state.patch [new file with mode: 0644]
queue-4.4/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch [new file with mode: 0644]
queue-4.4/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch [new file with mode: 0644]
queue-4.4/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch [new file with mode: 0644]
queue-4.4/rcu-update-documentation-of-rcu_read_unlock.patch [new file with mode: 0644]
queue-4.4/rtmutex-make-wait_lock-irq-safe.patch [new file with mode: 0644]
queue-4.4/series [new file with mode: 0644]

diff --git a/queue-4.4/futex-avoid-freeing-an-active-timer.patch b/queue-4.4/futex-avoid-freeing-an-active-timer.patch
new file mode 100644 (file)
index 0000000..eb61062
--- /dev/null
@@ -0,0 +1,61 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:22 +0800
+Subject: futex: Avoid freeing an active timer
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-10-thunder.leizhen@huawei.com>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+[ Upstream commit 97181f9bd57405b879403763284537e27d46963d ]
+
+Alexander reported a hrtimer debug_object splat:
+
+  ODEBUG: free active (active state 0) object type: hrtimer hint: hrtimer_wakeup (kernel/time/hrtimer.c:1423)
+
+  debug_object_free (lib/debugobjects.c:603)
+  destroy_hrtimer_on_stack (kernel/time/hrtimer.c:427)
+  futex_lock_pi (kernel/futex.c:2740)
+  do_futex (kernel/futex.c:3399)
+  SyS_futex (kernel/futex.c:3447 kernel/futex.c:3415)
+  do_syscall_64 (arch/x86/entry/common.c:284)
+  entry_SYSCALL64_slow_path (arch/x86/entry/entry_64.S:249)
+
+Which was caused by commit:
+
+  cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+
+... losing the hrtimer_cancel() in the shuffle. Where previously the
+hrtimer_cancel() was done by rt_mutex_slowlock() we now need to do it
+manually.
+
+Reported-by: Alexander Levin <alexander.levin@verizon.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Fixes: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704101802370.2906@nanos
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2960,8 +2960,10 @@ out_unlock_put_key:
+ out_put_key:
+       put_futex_key(&q.key);
+ out:
+-      if (to)
++      if (to) {
++              hrtimer_cancel(&to->timer);
+               destroy_hrtimer_on_stack(&to->timer);
++      }
+       return ret != -EINTR ? ret : -ERESTARTNOINTR;
+ uaddr_faulted:
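
The pattern the patch above restores -- cancel a possibly-armed timer before
its on-stack storage goes away -- can be illustrated outside the kernel. The
sketch below is a minimal user-space analogue using pthreads, not the kernel
hrtimer API; all names (stack_timer, timer_cancel_and_destroy, ...) are
hypothetical. Compile with -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

struct stack_timer {
	pthread_t thread;
	atomic_int armed;
	atomic_int *target;		/* points into the owner's stack frame */
};

static void *timer_fn(void *arg)
{
	struct stack_timer *t = arg;

	usleep(100 * 1000);		/* stand-in for the timeout expiring */
	if (atomic_load(&t->armed))
		atomic_store(t->target, 1);	/* fires into the stack frame */
	return NULL;
}

static void timer_cancel_and_destroy(struct stack_timer *t)
{
	atomic_store(&t->armed, 0);	/* analogue of hrtimer_cancel(): disarm */
	pthread_join(t->thread, NULL);	/* and wait until the handler cannot run */
	/* only now may the on-stack storage go out of scope */
}

static int lock_with_timeout(void)
{
	atomic_int timed_out = 0;	/* lives on this stack frame */
	struct stack_timer to = { .armed = 1, .target = &timed_out };

	pthread_create(&to.thread, NULL, timer_fn, &to);

	/* ... blocking lock attempt would go here ... */

	timer_cancel_and_destroy(&to);	/* the step the backport puts back */
	return atomic_load(&timed_out) ? -1 : 0;
}

int main(void)
{
	printf("lock_with_timeout() -> %d\n", lock_with_timeout());
	return 0;
}

Without the cancel-and-wait step the timer could still dereference
&timed_out after lock_with_timeout() has returned, which is exactly the
use-after-scope situation the ODEBUG splat points at.
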
diff --git a/queue-4.4/futex-cleanup-refcounting.patch b/queue-4.4/futex-cleanup-refcounting.patch
new file mode 100644 (file)
index 0000000..a24eb76
--- /dev/null
@@ -0,0 +1,83 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:15 +0800
+Subject: futex: Cleanup refcounting
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-3-thunder.leizhen@huawei.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit bf92cf3a5100f5a0d5f9834787b130159397cb22 ]
+
+Add a put_pi_state() as counterpart for get_pi_state() so the refcounting
+becomes consistent.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.801778516@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -825,7 +825,7 @@ static int refill_pi_state_cache(void)
+       return 0;
+ }
+-static struct futex_pi_state * alloc_pi_state(void)
++static struct futex_pi_state *alloc_pi_state(void)
+ {
+       struct futex_pi_state *pi_state = current->pi_state_cache;
+@@ -858,6 +858,11 @@ static void pi_state_update_owner(struct
+       }
+ }
++static void get_pi_state(struct futex_pi_state *pi_state)
++{
++      WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
++}
++
+ /*
+  * Drops a reference to the pi_state object and frees or caches it
+  * when the last reference is gone.
+@@ -901,7 +906,7 @@ static void put_pi_state(struct futex_pi
+  * Look up the task based on what TID userspace gave us.
+  * We dont trust it.
+  */
+-static struct task_struct * futex_find_get_task(pid_t pid)
++static struct task_struct *futex_find_get_task(pid_t pid)
+ {
+       struct task_struct *p;
+@@ -1149,7 +1154,7 @@ static int attach_to_pi_state(u32 __user
+               goto out_einval;
+ out_attach:
+-      atomic_inc(&pi_state->refcount);
++      get_pi_state(pi_state);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       *ps = pi_state;
+       return 0;
+@@ -2204,7 +2209,7 @@ retry_private:
+                */
+               if (requeue_pi) {
+                       /* Prepare the waiter to take the rt_mutex. */
+-                      atomic_inc(&pi_state->refcount);
++                      get_pi_state(pi_state);
+                       this->pi_state = pi_state;
+                       ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+                                                       this->rt_waiter,
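
The get/put pairing the patch introduces follows the usual kernel refcount
idiom: put drops a reference and frees (or caches) the object on the last
one, and get may only be called while the count is known to be non-zero,
which is what the atomic_inc_not_zero() WARN enforces. Below is a minimal
user-space sketch of that idiom, using C11 atomics rather than the kernel's
atomic_t; the names pi_state_like, get_state and put_state are made up for
illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct pi_state_like {
	atomic_int refcount;
	/* ... payload ... */
};

/* Analogue of get_pi_state(): never resurrect a count that already hit zero. */
static bool get_state(struct pi_state_like *s)
{
	int old = atomic_load(&s->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&s->refcount, &old, old + 1))
			return true;	/* took a reference */
	}
	return false;			/* object is already on its way out */
}

/* Analogue of put_pi_state(): free (or cache) when the last reference is gone. */
static void put_state(struct pi_state_like *s)
{
	if (atomic_fetch_sub(&s->refcount, 1) == 1)
		free(s);
}

int main(void)
{
	struct pi_state_like *s = calloc(1, sizeof(*s));

	atomic_store(&s->refcount, 1);		/* creator's reference */
	printf("get -> %d\n", get_state(s));	/* second reference, as in out_attach */
	put_state(s);
	put_state(s);				/* last put frees */
	return 0;
}
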
diff --git a/queue-4.4/futex-futex_unlock_pi-determinism.patch b/queue-4.4/futex-futex_unlock_pi-determinism.patch
new file mode 100644 (file)
index 0000000..47859e5
--- /dev/null
@@ -0,0 +1,88 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:19 +0800
+Subject: futex: Futex_unlock_pi() determinism
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-7-thunder.leizhen@huawei.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit bebe5b514345f09be2c15e414d076b02ecb9cce8 ]
+
+The problem with returning -EAGAIN when the waiter state mismatches is that
+it becomes very hard to prove a bounded execution time on the
+operation. And seeing that this is an RT operation, this is somewhat
+important.
+
+While in practice, given the previous patch, it is very unlikely to ever
+really take more than one or two rounds, proving so nevertheless remains
+rather hard.
+
+However, now that modifying wait_list is done while holding both hb->lock
+and wait_lock, the scenario can be avoided entirely by acquiring wait_lock
+while still holding hb->lock, doing a hand-over without leaving a hole.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.112378812@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   24 +++++++++++-------------
+ 1 file changed, 11 insertions(+), 13 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1555,15 +1555,10 @@ static int wake_futex_pi(u32 __user *uad
+       WAKE_Q(wake_q);
+       int ret = 0;
+-      raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+       new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+-      if (!new_owner) {
++      if (WARN_ON_ONCE(!new_owner)) {
+               /*
+-               * Since we held neither hb->lock nor wait_lock when coming
+-               * into this function, we could have raced with futex_lock_pi()
+-               * such that we might observe @this futex_q waiter, but the
+-               * rt_mutex's wait_list can be empty (either still, or again,
+-               * depending on which side we land).
++               * As per the comment in futex_unlock_pi() this should not happen.
+                *
+                * When this happens, give up our locks and try again, giving
+                * the futex_lock_pi() instance time to complete, either by
+@@ -3020,15 +3015,18 @@ retry:
+               if (pi_state->owner != current)
+                       goto out_unlock;
++              get_pi_state(pi_state);
+               /*
+-               * Grab a reference on the pi_state and drop hb->lock.
++               * Since modifying the wait_list is done while holding both
++               * hb->lock and wait_lock, holding either is sufficient to
++               * observe it.
+                *
+-               * The reference ensures pi_state lives, dropping the hb->lock
+-               * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
+-               * close the races against futex_lock_pi(), but in case of
+-               * _any_ fail we'll abort and retry the whole deal.
++               * By taking wait_lock while still holding hb->lock, we ensure
++               * there is no point where we hold neither; and therefore
++               * wake_futex_pi() must observe a state consistent with what we
++               * observed.
+                */
+-              get_pi_state(pi_state);
++              raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+               spin_unlock(&hb->lock);
+               ret = wake_futex_pi(uaddr, uval, pi_state);
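
The hand-over argument above relies on writers taking both locks: since the
wait_list is only modified while hb->lock and wait_lock are both held, a
reader holding either one sees a stable list, and acquiring wait_lock before
dropping hb->lock leaves no window in which neither is held. A minimal
user-space sketch of that hand-over, with two pthread mutexes standing in
for hb->lock and rt_mutex::wait_lock (hb_lock, wait_lock and shared_state
are hypothetical names). Compile with -pthread.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hb_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_state;		/* writers take *both* locks to change it */

/* Old scheme: drop hb_lock, then take wait_lock.  In the hole between the
 * two, a writer may change shared_state, so the caller must be prepared to
 * see a mismatch and retry (the -EAGAIN case). */
static int unlock_with_hole(void)
{
	pthread_mutex_lock(&hb_lock);
	int observed = shared_state;
	pthread_mutex_unlock(&hb_lock);
	/* <-- hole: neither lock held, observed may already be stale */
	pthread_mutex_lock(&wait_lock);
	int ok = (observed == shared_state);
	pthread_mutex_unlock(&wait_lock);
	return ok ? 0 : -1;		/* -1: would have to retry */
}

/* New scheme: take wait_lock while still holding hb_lock, then drop hb_lock.
 * Every instant is covered by at least one lock a writer would need, so the
 * observed value cannot change under us and no retry is needed. */
static int unlock_without_hole(void)
{
	pthread_mutex_lock(&hb_lock);
	int observed = shared_state;
	pthread_mutex_lock(&wait_lock);	/* hand-over, no hole */
	pthread_mutex_unlock(&hb_lock);
	int ok = (observed == shared_state);
	pthread_mutex_unlock(&wait_lock);
	return ok ? 0 : -1;
}

int main(void)
{
	printf("%d %d\n", unlock_with_hole(), unlock_without_hole());
	return 0;
}
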
diff --git a/queue-4.4/futex-handle-transient-ownerless-rtmutex-state-correctly.patch b/queue-4.4/futex-handle-transient-ownerless-rtmutex-state-correctly.patch
new file mode 100644 (file)
index 0000000..6d25902
--- /dev/null
@@ -0,0 +1,84 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:21 +0800
+Subject: futex: Handle transient "ownerless" rtmutex state correctly
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-9-thunder.leizhen@huawei.com>
+
+From: Mike Galbraith <efault@gmx.de>
+
+[ Upstream commit 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 ]
+
+Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner().
+This is one possible chain of events leading to this:
+
+Task Prio       Operation
+T1   120       lock(F)
+T2   120       lock(F)   -> blocks (top waiter)
+T3   50 (RT)   lock(F)   -> boosts T1 and blocks (new top waiter)
+XX             timeout/  -> wakes T2
+               signal
+T1   50                unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set)
+T2   120       cleanup   -> try_to_take_mutex() fails because T3 is the top waiter
+                            and the lower priority T2 cannot steal the lock.
+                         -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON()
+
+The comment states that this is invalid and rt_mutex_real_owner() must
+return a non NULL owner when the trylock failed, but in case of a queued
+and woken up waiter rt_mutex_real_owner() == NULL is a valid transient
+state. The higher priority waiter has simply not yet managed to take over
+the rtmutex.
+
+The BUG_ON() is therefore wrong and this is just another retry condition in
+fixup_pi_state_owner().
+
+Drop the locks, so that T3 can make progress, and then try the fixup again.
+
+Gratian provided a great analysis, traces and a reproducer. The analysis is
+to the point, but it confused the hell out of that tglx dude who had to
+page in all the futex horrors again. Condensed version is above.
+
+[ tglx: Wrote comment and changelog ]
+
+Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex")
+Reported-by: Gratian Crisan <gratian.crisan@ni.com>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com
+Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2437,10 +2437,22 @@ retry:
+               }
+               /*
+-               * Since we just failed the trylock; there must be an owner.
++               * The trylock just failed, so either there is an owner or
++               * there is a higher priority waiter than this one.
+                */
+               newowner = rt_mutex_owner(&pi_state->pi_mutex);
+-              BUG_ON(!newowner);
++              /*
++               * If the higher priority waiter has not yet taken over the
++               * rtmutex then newowner is NULL. We can't return here with
++               * that state because it's inconsistent vs. the user space
++               * state. So drop the locks and try again. It's a valid
++               * situation and not any different from the other retry
++               * conditions.
++               */
++              if (unlikely(!newowner)) {
++                      err = -EAGAIN;
++                      goto handle_fault;
++              }
+       } else {
+               WARN_ON_ONCE(argowner != current);
+               if (oldowner == current) {
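
The key point of the fix is that an ownerless rtmutex is a legitimate
transient state -- the previous owner has unlocked and the woken waiter has
not yet installed itself -- so the fixup path must retry instead of
asserting. A minimal sketch of that "retry on NULL owner" shape, using a C11
atomic pointer as a stand-in for the rtmutex owner field; owner and
fixup_owner are hypothetical names, not the kernel API.

#include <errno.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

static _Atomic(void *) owner;	/* NULL between unlock and the waiter taking over */

static int fixup_owner(void)
{
	void *newowner = atomic_load(&owner);

	if (newowner == NULL)
		return -EAGAIN;	/* transient state: drop locks and retry, don't BUG() */

	/* ... hand the pi_state bookkeeping over to newowner ... */
	return 0;
}

int main(void)
{
	int me;

	printf("ownerless -> %d\n", fixup_owner());	/* -EAGAIN */
	atomic_store(&owner, &me);			/* waiter took over */
	printf("owned     -> %d\n", fixup_owner());	/* 0 */
	return 0;
}
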
diff --git a/queue-4.4/futex-pull-rt_mutex_futex_unlock-out-from-under-hb-lock.patch b/queue-4.4/futex-pull-rt_mutex_futex_unlock-out-from-under-hb-lock.patch
new file mode 100644 (file)
index 0000000..4340e86
--- /dev/null
@@ -0,0 +1,257 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:17 +0800
+Subject: futex: Pull rt_mutex_futex_unlock() out from under hb->lock
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-5-thunder.leizhen@huawei.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit 16ffa12d742534d4ff73e8b3a4e81c1de39196f0 ]
+
+There's a number of 'interesting' problems, all caused by holding
+hb->lock while doing the rt_mutex_unlock() equivalent.
+
+Notably:
+
+ - a PI inversion on hb->lock; and,
+
+ - a SCHED_DEADLINE crash because of pointer instability.
+
+The previous changes:
+
+ - changed the locking rules to cover {uval,pi_state} with wait_lock.
+
+ - allowed rt_mutex_futex_unlock() to be done without dropping wait_lock,
+   which in turn allows relying on wait_lock atomicity completely.
+
+ - simplified the waiter conundrum.
+
+It's now sufficient to hold rtmutex::wait_lock and a reference on the
+pi_state to protect the state consistency, so hb->lock can be dropped
+before calling rt_mutex_futex_unlock().
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |  111 ++++++++++++++++++++++++++++++++++-----------------------
+ 1 file changed, 68 insertions(+), 43 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -966,10 +966,12 @@ static void exit_pi_state_list(struct ta
+               pi_state->owner = NULL;
+               raw_spin_unlock_irq(&curr->pi_lock);
+-              rt_mutex_futex_unlock(&pi_state->pi_mutex);
+-
++              get_pi_state(pi_state);
+               spin_unlock(&hb->lock);
++              rt_mutex_futex_unlock(&pi_state->pi_mutex);
++              put_pi_state(pi_state);
++
+               raw_spin_lock_irq(&curr->pi_lock);
+       }
+       raw_spin_unlock_irq(&curr->pi_lock);
+@@ -1083,6 +1085,11 @@ static int attach_to_pi_state(u32 __user
+        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+        * which in turn means that futex_lock_pi() still has a reference on
+        * our pi_state.
++       *
++       * The waiter holding a reference on @pi_state also protects against
++       * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
++       * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
++       * free pi_state before we can take a reference ourselves.
+        */
+       WARN_ON(!atomic_read(&pi_state->refcount));
+@@ -1537,48 +1544,40 @@ static void mark_wake_futex(struct wake_
+       q->lock_ptr = NULL;
+ }
+-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+-                       struct futex_hash_bucket *hb)
++/*
++ * Caller must hold a reference on @pi_state.
++ */
++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+ {
+-      struct task_struct *new_owner;
+-      struct futex_pi_state *pi_state = this->pi_state;
+       u32 uninitialized_var(curval), newval;
++      struct task_struct *new_owner;
++      bool deboost = false;
+       WAKE_Q(wake_q);
+-      bool deboost;
+       int ret = 0;
+-      if (!pi_state)
+-              return -EINVAL;
+-
+-      /*
+-       * If current does not own the pi_state then the futex is
+-       * inconsistent and user space fiddled with the futex value.
+-       */
+-      if (pi_state->owner != current)
+-              return -EINVAL;
+-
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+       new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+-
+-      /*
+-       * When we interleave with futex_lock_pi() where it does
+-       * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
+-       * but the rt_mutex's wait_list can be empty (either still, or again,
+-       * depending on which side we land).
+-       *
+-       * When this happens, give up our locks and try again, giving the
+-       * futex_lock_pi() instance time to complete, either by waiting on the
+-       * rtmutex or removing itself from the futex queue.
+-       */
+       if (!new_owner) {
+-              raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+-              return -EAGAIN;
++              /*
++               * Since we held neither hb->lock nor wait_lock when coming
++               * into this function, we could have raced with futex_lock_pi()
++               * such that we might observe @this futex_q waiter, but the
++               * rt_mutex's wait_list can be empty (either still, or again,
++               * depending on which side we land).
++               *
++               * When this happens, give up our locks and try again, giving
++               * the futex_lock_pi() instance time to complete, either by
++               * waiting on the rtmutex or removing itself from the futex
++               * queue.
++               */
++              ret = -EAGAIN;
++              goto out_unlock;
+       }
+       /*
+-       * We pass it to the next owner. The WAITERS bit is always
+-       * kept enabled while there is PI state around. We cleanup the
+-       * owner died bit, because we are the owner.
++       * We pass it to the next owner. The WAITERS bit is always kept
++       * enabled while there is PI state around. We cleanup the owner
++       * died bit, because we are the owner.
+        */
+       newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+@@ -1611,15 +1610,15 @@ static int wake_futex_pi(u32 __user *uad
+               deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+       }
++out_unlock:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+-      spin_unlock(&hb->lock);
+       if (deboost) {
+               wake_up_q(&wake_q);
+               rt_mutex_adjust_prio(current);
+       }
+-      return 0;
++      return ret;
+ }
+ /*
+@@ -2462,7 +2461,7 @@ retry:
+       if (get_futex_value_locked(&uval, uaddr))
+               goto handle_fault;
+-      while (1) {
++      for (;;) {
+               newval = (uval & FUTEX_OWNER_DIED) | newtid;
+               if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+@@ -2975,10 +2974,36 @@ retry:
+        */
+       match = futex_top_waiter(hb, &key);
+       if (match) {
+-              ret = wake_futex_pi(uaddr, uval, match, hb);
++              struct futex_pi_state *pi_state = match->pi_state;
++
++              ret = -EINVAL;
++              if (!pi_state)
++                      goto out_unlock;
++
+               /*
+-               * In case of success wake_futex_pi dropped the hash
+-               * bucket lock.
++               * If current does not own the pi_state then the futex is
++               * inconsistent and user space fiddled with the futex value.
++               */
++              if (pi_state->owner != current)
++                      goto out_unlock;
++
++              /*
++               * Grab a reference on the pi_state and drop hb->lock.
++               *
++               * The reference ensures pi_state lives, dropping the hb->lock
++               * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
++               * close the races against futex_lock_pi(), but in case of
++               * _any_ fail we'll abort and retry the whole deal.
++               */
++              get_pi_state(pi_state);
++              spin_unlock(&hb->lock);
++
++              ret = wake_futex_pi(uaddr, uval, pi_state);
++
++              put_pi_state(pi_state);
++
++              /*
++               * Success, we're done! No tricky corner cases.
+                */
+               if (!ret)
+                       goto out_putkey;
+@@ -2993,7 +3018,6 @@ retry:
+                * setting the FUTEX_WAITERS bit. Try again.
+                */
+               if (ret == -EAGAIN) {
+-                      spin_unlock(&hb->lock);
+                       put_futex_key(&key);
+                       goto retry;
+               }
+@@ -3001,7 +3025,7 @@ retry:
+                * wake_futex_pi has detected invalid state. Tell user
+                * space.
+                */
+-              goto out_unlock;
++              goto out_putkey;
+       }
+       /*
+@@ -3011,8 +3035,10 @@ retry:
+        * preserve the WAITERS bit not the OWNER_DIED one. We are the
+        * owner.
+        */
+-      if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
++      if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
++              spin_unlock(&hb->lock);
+               goto pi_faulted;
++      }
+       /*
+        * If uval has changed, let user space handle it.
+@@ -3026,7 +3052,6 @@ out_putkey:
+       return ret;
+ pi_faulted:
+-      spin_unlock(&hb->lock);
+       put_futex_key(&key);
+       ret = fault_in_user_writeable(uaddr);
diff --git a/queue-4.4/futex-rename-free_pi_state-to-put_pi_state.patch b/queue-4.4/futex-rename-free_pi_state-to-put_pi_state.patch
new file mode 100644 (file)
index 0000000..4cc57a5
--- /dev/null
@@ -0,0 +1,101 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:14 +0800
+Subject: futex: Rename free_pi_state() to put_pi_state()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-2-thunder.leizhen@huawei.com>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+[ Upstream commit 29e9ee5d48c35d6cf8afe09bdf03f77125c9ac11 ]
+
+free_pi_state() is confusing as it is in fact only freeing/caching the
+pi state when the last reference is gone. Rename it to put_pi_state()
+which reflects better what it is doing.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Darren Hart <darren@dvhart.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: Bhuvanesh_Surachari@mentor.com
+Cc: Andy Lowe <Andy_Lowe@mentor.com>
+Link: http://lkml.kernel.org/r/20151219200607.259636467@linutronix.de
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c |   17 ++++++++++-------
+ 1 file changed, 10 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -859,9 +859,12 @@ static void pi_state_update_owner(struct
+ }
+ /*
++ * Drops a reference to the pi_state object and frees or caches it
++ * when the last reference is gone.
++ *
+  * Must be called with the hb lock held.
+  */
+-static void free_pi_state(struct futex_pi_state *pi_state)
++static void put_pi_state(struct futex_pi_state *pi_state)
+ {
+       if (!pi_state)
+               return;
+@@ -2121,7 +2124,7 @@ retry_private:
+               case 0:
+                       break;
+               case -EFAULT:
+-                      free_pi_state(pi_state);
++                      put_pi_state(pi_state);
+                       pi_state = NULL;
+                       double_unlock_hb(hb1, hb2);
+                       hb_waiters_dec(hb2);
+@@ -2139,7 +2142,7 @@ retry_private:
+                        *   exit to complete.
+                        * - EAGAIN: The user space value changed.
+                        */
+-                      free_pi_state(pi_state);
++                      put_pi_state(pi_state);
+                       pi_state = NULL;
+                       double_unlock_hb(hb1, hb2);
+                       hb_waiters_dec(hb2);
+@@ -2214,7 +2217,7 @@ retry_private:
+                       } else if (ret) {
+                               /* -EDEADLK */
+                               this->pi_state = NULL;
+-                              free_pi_state(pi_state);
++                              put_pi_state(pi_state);
+                               goto out_unlock;
+                       }
+               }
+@@ -2223,7 +2226,7 @@ retry_private:
+       }
+ out_unlock:
+-      free_pi_state(pi_state);
++      put_pi_state(pi_state);
+       double_unlock_hb(hb1, hb2);
+       wake_up_q(&wake_q);
+       hb_waiters_dec(hb2);
+@@ -2376,7 +2379,7 @@ static void unqueue_me_pi(struct futex_q
+       __unqueue_futex(q);
+       BUG_ON(!q->pi_state);
+-      free_pi_state(q->pi_state);
++      put_pi_state(q->pi_state);
+       q->pi_state = NULL;
+       spin_unlock(q->lock_ptr);
+@@ -3210,7 +3213,7 @@ static int futex_wait_requeue_pi(u32 __u
+                        * Drop the reference to the pi state which
+                        * the requeue_pi() code acquired for us.
+                        */
+-                      free_pi_state(q.pi_state);
++                      put_pi_state(q.pi_state);
+                       spin_unlock(q.lock_ptr);
+                       /*
+                        * Adjust the return value. It's either -EFAULT or
diff --git a/queue-4.4/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch b/queue-4.4/futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
new file mode 100644 (file)
index 0000000..c851d56
--- /dev/null
@@ -0,0 +1,272 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:18 +0800
+Subject: futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-6-thunder.leizhen@huawei.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit cfafcd117da0216520568c195cb2f6cd1980c4bb ]
+
+By changing futex_lock_pi() to use rt_mutex_*_proxy_lock() all wait_list
+modifications are done under both hb->lock and wait_lock.
+
+This closes the obvious interleave pattern between futex_lock_pi() and
+futex_unlock_pi(), but not entirely so. See below:
+
+Before:
+
+futex_lock_pi()                        futex_unlock_pi()
+  unlock hb->lock
+
+                                 lock hb->lock
+                                 unlock hb->lock
+
+                                 lock rt_mutex->wait_lock
+                                 unlock rt_mutex_wait_lock
+                                   -EAGAIN
+
+  lock rt_mutex->wait_lock
+  list_add
+  unlock rt_mutex->wait_lock
+
+  schedule()
+
+  lock rt_mutex->wait_lock
+  list_del
+  unlock rt_mutex->wait_lock
+
+                                 <idem>
+                                   -EAGAIN
+
+  lock hb->lock
+
+After:
+
+futex_lock_pi()                        futex_unlock_pi()
+
+  lock hb->lock
+  lock rt_mutex->wait_lock
+  list_add
+  unlock rt_mutex->wait_lock
+  unlock hb->lock
+
+  schedule()
+                                 lock hb->lock
+                                 unlock hb->lock
+  lock hb->lock
+  lock rt_mutex->wait_lock
+  list_del
+  unlock rt_mutex->wait_lock
+
+                                 lock rt_mutex->wait_lock
+                                 unlock rt_mutex_wait_lock
+                                   -EAGAIN
+
+  unlock hb->lock
+
+It does however solve the earlier starvation/live-lock scenario which got
+introduced with the -EAGAIN: unlike the before scenario, where the -EAGAIN
+happens while futex_unlock_pi() doesn't hold any locks, in the after
+scenario it happens while futex_unlock_pi() actually holds a lock, and it
+is then serialized on that lock.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c                  |   77 ++++++++++++++++++++++++++++------------
+ kernel/locking/rtmutex.c        |   26 +++----------
+ kernel/locking/rtmutex_common.h |    1 
+ 3 files changed, 62 insertions(+), 42 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2284,20 +2284,7 @@ queue_unlock(struct futex_hash_bucket *h
+       hb_waiters_dec(hb);
+ }
+-/**
+- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+- * @q:        The futex_q to enqueue
+- * @hb:       The destination hash bucket
+- *
+- * The hb->lock must be held by the caller, and is released here. A call to
+- * queue_me() is typically paired with exactly one call to unqueue_me().  The
+- * exceptions involve the PI related operations, which may use unqueue_me_pi()
+- * or nothing if the unqueue is done as part of the wake process and the unqueue
+- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+- * an example).
+- */
+-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+-      __releases(&hb->lock)
++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ {
+       int prio;
+@@ -2314,6 +2301,24 @@ static inline void queue_me(struct futex
+       plist_node_init(&q->list, prio);
+       plist_add(&q->list, &hb->chain);
+       q->task = current;
++}
++
++/**
++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
++ * @q:        The futex_q to enqueue
++ * @hb:       The destination hash bucket
++ *
++ * The hb->lock must be held by the caller, and is released here. A call to
++ * queue_me() is typically paired with exactly one call to unqueue_me().  The
++ * exceptions involve the PI related operations, which may use unqueue_me_pi()
++ * or nothing if the unqueue is done as part of the wake process and the unqueue
++ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
++ * an example).
++ */
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++      __releases(&hb->lock)
++{
++      __queue_me(q, hb);
+       spin_unlock(&hb->lock);
+ }
+@@ -2819,6 +2824,7 @@ static int futex_lock_pi(u32 __user *uad
+ {
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct task_struct *exiting = NULL;
++      struct rt_mutex_waiter rt_waiter;
+       struct futex_hash_bucket *hb;
+       struct futex_q q = futex_q_init;
+       int res, ret;
+@@ -2879,25 +2885,52 @@ retry_private:
+               }
+       }
++      WARN_ON(!q.pi_state);
++
+       /*
+        * Only actually queue now that the atomic ops are done:
+        */
+-      queue_me(&q, hb);
++      __queue_me(&q, hb);
+-      WARN_ON(!q.pi_state);
+-      /*
+-       * Block on the PI mutex:
+-       */
+-      if (!trylock) {
+-              ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+-      } else {
++      if (trylock) {
+               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+               /* Fixup the trylock return value: */
+               ret = ret ? 0 : -EWOULDBLOCK;
++              goto no_block;
+       }
++      /*
++       * We must add ourselves to the rt_mutex waitlist while holding hb->lock
++       * such that the hb and rt_mutex wait lists match.
++       */
++      rt_mutex_init_waiter(&rt_waiter);
++      ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++      if (ret) {
++              if (ret == 1)
++                      ret = 0;
++
++              goto no_block;
++      }
++
++      spin_unlock(q.lock_ptr);
++
++      if (unlikely(to))
++              hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
++
++      ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
++
+       spin_lock(q.lock_ptr);
+       /*
++       * If we failed to acquire the lock (signal/timeout), we must
++       * first acquire the hb->lock before removing the lock from the
++       * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
++       * wait lists consistent.
++       */
++      if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
++              ret = 0;
++
++no_block:
++      /*
+        * Fixup the pi_state owner and possibly acquire the lock if we
+        * haven't already.
+        */
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1489,19 +1489,6 @@ int __sched rt_mutex_lock_interruptible(
+ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+ /*
+- * Futex variant with full deadlock detection.
+- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
+- */
+-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+-                            struct hrtimer_sleeper *timeout)
+-{
+-      might_sleep();
+-
+-      return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
+-                               timeout, RT_MUTEX_FULL_CHAINWALK);
+-}
+-
+-/*
+  * Futex variant, must not use fastpath.
+  */
+ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+@@ -1774,12 +1761,6 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+       /* sleep on the mutex */
+       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+-      /*
+-       * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+-       * have to fix that up.
+-       */
+-      fixup_rt_mutex_waiters(lock);
+-
+       raw_spin_unlock(&lock->wait_lock);
+       return ret;
+@@ -1819,6 +1800,13 @@ bool rt_mutex_cleanup_proxy_lock(struct
+               fixup_rt_mutex_waiters(lock);
+               cleanup = true;
+       }
++
++      /*
++       * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++       * have to fix that up.
++       */
++      fixup_rt_mutex_waiters(lock);
++
+       raw_spin_unlock_irq(&lock->wait_lock);
+       return cleanup;
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -111,7 +111,6 @@ extern int rt_mutex_wait_proxy_lock(stru
+                              struct rt_mutex_waiter *waiter);
+ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+                                struct rt_mutex_waiter *waiter);
+-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+ extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+ extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
diff --git a/queue-4.4/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch b/queue-4.4/futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
new file mode 100644 (file)
index 0000000..b066748
--- /dev/null
@@ -0,0 +1,135 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:23 +0800
+Subject: futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-11-thunder.leizhen@huawei.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit 04dc1b2fff4e96cb4142227fbdc63c8871ad4ed9 ]
+
+Markus reported that the glibc/nptl/tst-robustpi8 test was failing after
+commit:
+
+  cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+
+The following trace shows the problem:
+
+ ld-linux-x86-64-2161  [019] ....   410.760971: SyS_futex: 00007ffbeb76b028: 80000875  op=FUTEX_LOCK_PI
+ ld-linux-x86-64-2161  [019] ...1   410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0
+ ld-linux-x86-64-2165  [011] ....   410.760978: SyS_futex: 00007ffbeb76b028: 80000875  op=FUTEX_UNLOCK_PI
+ ld-linux-x86-64-2165  [011] d..1   410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0
+ ld-linux-x86-64-2165  [011] ....   410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000
+ ld-linux-x86-64-2161  [019] ....   410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT
+
+Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161
+which then returns with -ETIMEDOUT. That wrecks the lock state, because now
+the owner isn't aware it acquired the lock and removes the pending robust
+list entry.
+
+If 2161 is killed, the robust list will not clear out this futex and the
+subsequent acquire on this futex will then (correctly) result in -ESRCH
+which is unexpected by glibc, triggers an internal assertion and dies.
+
+Task 2161                      Task 2165
+
+rt_mutex_wait_proxy_lock()
+   timeout();
+   /* T2161 is still queued in  the waiter list */
+   return -ETIMEDOUT;
+
+                               futex_unlock_pi()
+                               spin_lock(hb->lock);
+                               rtmutex_unlock()
+                                 remove_rtmutex_waiter(T2161);
+                                  mark_lock_available();
+                               /* Make the next waiter owner of the user space side */
+                               futex_uval = 2161;
+                               spin_unlock(hb->lock);
+spin_lock(hb->lock);
+rt_mutex_cleanup_proxy_lock()
+  if (rtmutex_owner() != current)
+     ...
+     return FAIL;
+....
+return -ETIMEDOUT;
+
+This means that rt_mutex_cleanup_proxy_lock() needs to call
+try_to_take_rt_mutex() so it can correctly take over the rtmutex that was
+assigned by the waker. If the rtmutex is owned by some other task then this
+call is harmless and just confirms that the waiter is not able to acquire
+it.
+
+While there, fix what looks like a merge error which resulted in
+rt_mutex_cleanup_proxy_lock() having two calls to
+fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any.
+Both should have one, since both potentially touch the waiter list.
+
+Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()")
+Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
+Bug-Spotted-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Florian Weimer <fweimer@redhat.com>
+Cc: Darren Hart <dvhart@infradead.org>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
+Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/locking/rtmutex.c |   24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1764,12 +1764,14 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+       int ret;
+       raw_spin_lock_irq(&lock->wait_lock);
+-
+-      set_current_state(TASK_INTERRUPTIBLE);
+-
+       /* sleep on the mutex */
++      set_current_state(TASK_INTERRUPTIBLE);
+       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+-
++      /*
++       * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++       * have to fix that up.
++       */
++      fixup_rt_mutex_waiters(lock);
+       raw_spin_unlock_irq(&lock->wait_lock);
+       return ret;
+@@ -1801,15 +1803,25 @@ bool rt_mutex_cleanup_proxy_lock(struct
+       raw_spin_lock_irq(&lock->wait_lock);
+       /*
++       * Do an unconditional try-lock, this deals with the lock stealing
++       * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
++       * sets a NULL owner.
++       *
++       * We're not interested in the return value, because the subsequent
++       * test on rt_mutex_owner() will infer that. If the trylock succeeded,
++       * we will own the lock and it will have removed the waiter. If we
++       * failed the trylock, we're still not owner and we need to remove
++       * ourselves.
++       */
++      try_to_take_rt_mutex(lock, current, waiter);
++      /*
+        * Unless we're the owner; we're still enqueued on the wait_list.
+        * So check if we became owner, if not, take us off the wait_list.
+        */
+       if (rt_mutex_owner(lock) != current) {
+               remove_waiter(lock, waiter);
+-              fixup_rt_mutex_waiters(lock);
+               cleanup = true;
+       }
+-
+       /*
+        * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+        * have to fix that up.
diff --git a/queue-4.4/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch b/queue-4.4/futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
new file mode 100644 (file)
index 0000000..f064c65
--- /dev/null
@@ -0,0 +1,87 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:16 +0800
+Subject: futex,rt_mutex: Introduce rt_mutex_init_waiter()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-4-thunder.leizhen@huawei.com>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit 50809358dd7199aa7ce232f6877dd09ec30ef374 ]
+
+Since there are already two copies of this code, introduce a helper now
+before adding a third one.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/futex.c                  |    5 +----
+ kernel/locking/rtmutex.c        |   12 +++++++++---
+ kernel/locking/rtmutex_common.h |    1 +
+ 3 files changed, 11 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3156,10 +3156,7 @@ static int futex_wait_requeue_pi(u32 __u
+        * The waiter is allocated on our stack, manipulated by the requeue
+        * code while we sleep on uaddr.
+        */
+-      debug_rt_mutex_init_waiter(&rt_waiter);
+-      RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+-      RB_CLEAR_NODE(&rt_waiter.tree_entry);
+-      rt_waiter.task = NULL;
++      rt_mutex_init_waiter(&rt_waiter);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+       if (unlikely(ret != 0))
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1155,6 +1155,14 @@ void rt_mutex_adjust_pi(struct task_stru
+                                  next_lock, NULL, task);
+ }
++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
++{
++      debug_rt_mutex_init_waiter(waiter);
++      RB_CLEAR_NODE(&waiter->pi_tree_entry);
++      RB_CLEAR_NODE(&waiter->tree_entry);
++      waiter->task = NULL;
++}
++
+ /**
+  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+  * @lock:              the rt_mutex to take
+@@ -1236,9 +1244,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+       struct rt_mutex_waiter waiter;
+       int ret = 0;
+-      debug_rt_mutex_init_waiter(&waiter);
+-      RB_CLEAR_NODE(&waiter.pi_tree_entry);
+-      RB_CLEAR_NODE(&waiter.tree_entry);
++      rt_mutex_init_waiter(&waiter);
+       raw_spin_lock(&lock->wait_lock);
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -102,6 +102,7 @@ extern struct task_struct *rt_mutex_next
+ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+                                      struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                    struct rt_mutex_waiter *waiter,
+                                    struct task_struct *task);
diff --git a/queue-4.4/rcu-update-documentation-of-rcu_read_unlock.patch b/queue-4.4/rcu-update-documentation-of-rcu_read_unlock.patch
new file mode 100644 (file)
index 0000000..9201885
--- /dev/null
@@ -0,0 +1,46 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:24 +0800
+Subject: rcu: Update documentation of rcu_read_unlock()
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-12-thunder.leizhen@huawei.com>
+
+From: Anna-Maria Gleixner <anna-maria@linutronix.de>
+
+[ Upstream commit ec84b27f9b3b569f9235413d1945a2006b97b0aa ]
+
+Since commit b4abf91047cf ("rtmutex: Make wait_lock irq safe") the
+explanation in rcu_read_unlock() documentation about irq unsafe rtmutex
+wait_lock is no longer valid.
+
+Remove it to prevent kernel developers reading the documentation from
+relying on it.
+
+Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
+Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Cc: bigeasy@linutronix.de
+Link: https://lkml.kernel.org/r/20180525090507.22248-2-anna-maria@linutronix.de
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/rcupdate.h |    4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/include/linux/rcupdate.h
++++ b/include/linux/rcupdate.h
+@@ -880,9 +880,7 @@ static __always_inline void rcu_read_loc
+  * Unfortunately, this function acquires the scheduler's runqueue and
+  * priority-inheritance spinlocks.  This means that deadlock could result
+  * if the caller of rcu_read_unlock() already holds one of these locks or
+- * any lock that is ever acquired while holding them; or any lock which
+- * can be taken from interrupt context because rcu_boost()->rt_mutex_lock()
+- * does not disable irqs while taking ->wait_lock.
++ * any lock that is ever acquired while holding them.
+  *
+  * That said, RCU readers are never priority boosted unless they were
+  * preempted.  Therefore, one way to avoid deadlock is to make sure
diff --git a/queue-4.4/rtmutex-make-wait_lock-irq-safe.patch b/queue-4.4/rtmutex-make-wait_lock-irq-safe.patch
new file mode 100644 (file)
index 0000000..fbaeec5
--- /dev/null
@@ -0,0 +1,545 @@
+From foo@baz Sun Aug  8 08:43:25 AM CEST 2021
+From: Zhen Lei <thunder.leizhen@huawei.com>
+Date: Mon, 2 Aug 2021 21:46:20 +0800
+Subject: rtmutex: Make wait_lock irq safe
+To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>, stable <stable@vger.kernel.org>
+Cc: Zhen Lei <thunder.leizhen@huawei.com>, Anna-Maria Gleixner <anna-maria@linutronix.de>, Mike Galbraith <efault@gmx.de>, Sasha Levin <sasha.levin@oracle.com>, Ingo Molnar <mingo@kernel.org>, Peter Zijlstra <peterz@infradead.org>, Thomas Gleixner <tglx@linutronix.de>, linux-kernel <linux-kernel@vger.kernel.org>
+Message-ID: <20210802134624.1934-8-thunder.leizhen@huawei.com>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+[ Upstream commit b4abf91047cf054f203dcfac97e1038388826937 ]
+
+Sasha reported a lockdep splat about a potential deadlock between RCU boosting
+rtmutex and the posix timer it_lock.
+
+CPU0                                   CPU1
+
+rtmutex_lock(&rcu->rt_mutex)
+  spin_lock(&rcu->rt_mutex.wait_lock)
+                                       local_irq_disable()
+                                       spin_lock(&timer->it_lock)
+                                       spin_lock(&rcu->mutex.wait_lock)
+--> Interrupt
+    spin_lock(&timer->it_lock)
+
+This is caused by the following code sequence on CPU1
+
+     rcu_read_lock()
+     x = lookup();
+     if (x)
+       spin_lock_irqsave(&x->it_lock);
+     rcu_read_unlock();
+     return x;
+
+We could fix that in the posix timer code by keeping rcu read locked across
+the spinlocked and irq disabled section, but the above sequence is common and
+there is no reason not to support it.
+
+Taking rt_mutex.wait_lock irq safe prevents the deadlock.
+
+Reported-by: Sasha Levin <sasha.levin@oracle.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
+Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
+Acked-by: Joe Korty <joe.korty@concurrent-rt.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/locking/rtmutex.c |  135 +++++++++++++++++++++++++----------------------
+ 1 file changed, 72 insertions(+), 63 deletions(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -163,13 +163,14 @@ static inline void mark_rt_mutex_waiters
+  * 2) Drop lock->wait_lock
+  * 3) Try to unlock the lock with cmpxchg
+  */
+-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
++static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
++                                      unsigned long flags)
+       __releases(lock->wait_lock)
+ {
+       struct task_struct *owner = rt_mutex_owner(lock);
+       clear_rt_mutex_waiters(lock);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+       /*
+        * If a new waiter comes in between the unlock and the cmpxchg
+        * we have two situations:
+@@ -211,11 +212,12 @@ static inline void mark_rt_mutex_waiters
+ /*
+  * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+  */
+-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
++static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
++                                      unsigned long flags)
+       __releases(lock->wait_lock)
+ {
+       lock->owner = NULL;
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+       return true;
+ }
+ #endif
+@@ -497,7 +499,6 @@ static int rt_mutex_adjust_prio_chain(st
+       int ret = 0, depth = 0;
+       struct rt_mutex *lock;
+       bool detect_deadlock;
+-      unsigned long flags;
+       bool requeue = true;
+       detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
+@@ -540,7 +541,7 @@ static int rt_mutex_adjust_prio_chain(st
+       /*
+        * [1] Task cannot go away as we did a get_task() before !
+        */
+-      raw_spin_lock_irqsave(&task->pi_lock, flags);
++      raw_spin_lock_irq(&task->pi_lock);
+       /*
+        * [2] Get the waiter on which @task is blocked on.
+@@ -624,7 +625,7 @@ static int rt_mutex_adjust_prio_chain(st
+        * operations.
+        */
+       if (!raw_spin_trylock(&lock->wait_lock)) {
+-              raw_spin_unlock_irqrestore(&task->pi_lock, flags);
++              raw_spin_unlock_irq(&task->pi_lock);
+               cpu_relax();
+               goto retry;
+       }
+@@ -655,7 +656,7 @@ static int rt_mutex_adjust_prio_chain(st
+               /*
+                * No requeue[7] here. Just release @task [8]
+                */
+-              raw_spin_unlock_irqrestore(&task->pi_lock, flags);
++              raw_spin_unlock(&task->pi_lock);
+               put_task_struct(task);
+               /*
+@@ -663,14 +664,14 @@ static int rt_mutex_adjust_prio_chain(st
+                * If there is no owner of the lock, end of chain.
+                */
+               if (!rt_mutex_owner(lock)) {
+-                      raw_spin_unlock(&lock->wait_lock);
++                      raw_spin_unlock_irq(&lock->wait_lock);
+                       return 0;
+               }
+               /* [10] Grab the next task, i.e. owner of @lock */
+               task = rt_mutex_owner(lock);
+               get_task_struct(task);
+-              raw_spin_lock_irqsave(&task->pi_lock, flags);
++              raw_spin_lock(&task->pi_lock);
+               /*
+                * No requeue [11] here. We just do deadlock detection.
+@@ -685,8 +686,8 @@ static int rt_mutex_adjust_prio_chain(st
+               top_waiter = rt_mutex_top_waiter(lock);
+               /* [13] Drop locks */
+-              raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+-              raw_spin_unlock(&lock->wait_lock);
++              raw_spin_unlock(&task->pi_lock);
++              raw_spin_unlock_irq(&lock->wait_lock);
+               /* If owner is not blocked, end of chain. */
+               if (!next_lock)
+@@ -707,7 +708,7 @@ static int rt_mutex_adjust_prio_chain(st
+       rt_mutex_enqueue(lock, waiter);
+       /* [8] Release the task */
+-      raw_spin_unlock_irqrestore(&task->pi_lock, flags);
++      raw_spin_unlock(&task->pi_lock);
+       put_task_struct(task);
+       /*
+@@ -725,14 +726,14 @@ static int rt_mutex_adjust_prio_chain(st
+                */
+               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
+                       wake_up_process(rt_mutex_top_waiter(lock)->task);
+-              raw_spin_unlock(&lock->wait_lock);
++              raw_spin_unlock_irq(&lock->wait_lock);
+               return 0;
+       }
+       /* [10] Grab the next task, i.e. the owner of @lock */
+       task = rt_mutex_owner(lock);
+       get_task_struct(task);
+-      raw_spin_lock_irqsave(&task->pi_lock, flags);
++      raw_spin_lock(&task->pi_lock);
+       /* [11] requeue the pi waiters if necessary */
+       if (waiter == rt_mutex_top_waiter(lock)) {
+@@ -786,8 +787,8 @@ static int rt_mutex_adjust_prio_chain(st
+       top_waiter = rt_mutex_top_waiter(lock);
+       /* [13] Drop the locks */
+-      raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock(&task->pi_lock);
++      raw_spin_unlock_irq(&lock->wait_lock);
+       /*
+        * Make the actual exit decisions [12], based on the stored
+@@ -810,7 +811,7 @@ static int rt_mutex_adjust_prio_chain(st
+       goto again;
+  out_unlock_pi:
+-      raw_spin_unlock_irqrestore(&task->pi_lock, flags);
++      raw_spin_unlock_irq(&task->pi_lock);
+  out_put_task:
+       put_task_struct(task);
+@@ -820,7 +821,7 @@ static int rt_mutex_adjust_prio_chain(st
+ /*
+  * Try to take an rt-mutex
+  *
+- * Must be called with lock->wait_lock held.
++ * Must be called with lock->wait_lock held and interrupts disabled
+  *
+  * @lock:   The lock to be acquired.
+  * @task:   The task which wants to acquire the lock
+@@ -830,8 +831,6 @@ static int rt_mutex_adjust_prio_chain(st
+ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+                               struct rt_mutex_waiter *waiter)
+ {
+-      unsigned long flags;
+-
+       /*
+        * Before testing whether we can acquire @lock, we set the
+        * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
+@@ -916,7 +915,7 @@ static int try_to_take_rt_mutex(struct r
+        * case, but conditionals are more expensive than a redundant
+        * store.
+        */
+-      raw_spin_lock_irqsave(&task->pi_lock, flags);
++      raw_spin_lock(&task->pi_lock);
+       task->pi_blocked_on = NULL;
+       /*
+        * Finish the lock acquisition. @task is the new owner. If
+@@ -925,7 +924,7 @@ static int try_to_take_rt_mutex(struct r
+        */
+       if (rt_mutex_has_waiters(lock))
+               rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
+-      raw_spin_unlock_irqrestore(&task->pi_lock, flags);
++      raw_spin_unlock(&task->pi_lock);
+ takeit:
+       /* We got the lock. */
+@@ -945,7 +944,7 @@ takeit:
+  *
+  * Prepare waiter and propagate pi chain
+  *
+- * This must be called with lock->wait_lock held.
++ * This must be called with lock->wait_lock held and interrupts disabled
+  */
+ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
+                                  struct rt_mutex_waiter *waiter,
+@@ -956,7 +955,6 @@ static int task_blocks_on_rt_mutex(struc
+       struct rt_mutex_waiter *top_waiter = waiter;
+       struct rt_mutex *next_lock;
+       int chain_walk = 0, res;
+-      unsigned long flags;
+       /*
+        * Early deadlock detection. We really don't want the task to
+@@ -970,7 +968,7 @@ static int task_blocks_on_rt_mutex(struc
+       if (owner == task)
+               return -EDEADLK;
+-      raw_spin_lock_irqsave(&task->pi_lock, flags);
++      raw_spin_lock(&task->pi_lock);
+       __rt_mutex_adjust_prio(task);
+       waiter->task = task;
+       waiter->lock = lock;
+@@ -983,12 +981,12 @@ static int task_blocks_on_rt_mutex(struc
+       task->pi_blocked_on = waiter;
+-      raw_spin_unlock_irqrestore(&task->pi_lock, flags);
++      raw_spin_unlock(&task->pi_lock);
+       if (!owner)
+               return 0;
+-      raw_spin_lock_irqsave(&owner->pi_lock, flags);
++      raw_spin_lock(&owner->pi_lock);
+       if (waiter == rt_mutex_top_waiter(lock)) {
+               rt_mutex_dequeue_pi(owner, top_waiter);
+               rt_mutex_enqueue_pi(owner, waiter);
+@@ -1003,7 +1001,7 @@ static int task_blocks_on_rt_mutex(struc
+       /* Store the lock on which owner is blocked or NULL */
+       next_lock = task_blocked_on_lock(owner);
+-      raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
++      raw_spin_unlock(&owner->pi_lock);
+       /*
+        * Even if full deadlock detection is on, if the owner is not
+        * blocked itself, we can avoid finding this out in the chain
+@@ -1019,12 +1017,12 @@ static int task_blocks_on_rt_mutex(struc
+        */
+       get_task_struct(owner);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irq(&lock->wait_lock);
+       res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
+                                        next_lock, waiter, task);
+-      raw_spin_lock(&lock->wait_lock);
++      raw_spin_lock_irq(&lock->wait_lock);
+       return res;
+ }
+@@ -1033,15 +1031,14 @@ static int task_blocks_on_rt_mutex(struc
+  * Remove the top waiter from the current tasks pi waiter tree and
+  * queue it up.
+  *
+- * Called with lock->wait_lock held.
++ * Called with lock->wait_lock held and interrupts disabled.
+  */
+ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
+                                   struct rt_mutex *lock)
+ {
+       struct rt_mutex_waiter *waiter;
+-      unsigned long flags;
+-      raw_spin_lock_irqsave(&current->pi_lock, flags);
++      raw_spin_lock(&current->pi_lock);
+       waiter = rt_mutex_top_waiter(lock);
+@@ -1063,7 +1060,7 @@ static void mark_wakeup_next_waiter(stru
+        */
+       lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
+-      raw_spin_unlock_irqrestore(&current->pi_lock, flags);
++      raw_spin_unlock(&current->pi_lock);
+       wake_q_add(wake_q, waiter->task);
+ }
+@@ -1071,7 +1068,7 @@ static void mark_wakeup_next_waiter(stru
+ /*
+  * Remove a waiter from a lock and give up
+  *
+- * Must be called with lock->wait_lock held and
++ * Must be called with lock->wait_lock held and interrupts disabled. I must
+  * have just failed to try_to_take_rt_mutex().
+  */
+ static void remove_waiter(struct rt_mutex *lock,
+@@ -1080,12 +1077,11 @@ static void remove_waiter(struct rt_mute
+       bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+       struct task_struct *owner = rt_mutex_owner(lock);
+       struct rt_mutex *next_lock;
+-      unsigned long flags;
+-      raw_spin_lock_irqsave(&current->pi_lock, flags);
++      raw_spin_lock(&current->pi_lock);
+       rt_mutex_dequeue(lock, waiter);
+       current->pi_blocked_on = NULL;
+-      raw_spin_unlock_irqrestore(&current->pi_lock, flags);
++      raw_spin_unlock(&current->pi_lock);
+       /*
+        * Only update priority if the waiter was the highest priority
+@@ -1094,7 +1090,7 @@ static void remove_waiter(struct rt_mute
+       if (!owner || !is_top_waiter)
+               return;
+-      raw_spin_lock_irqsave(&owner->pi_lock, flags);
++      raw_spin_lock(&owner->pi_lock);
+       rt_mutex_dequeue_pi(owner, waiter);
+@@ -1106,7 +1102,7 @@ static void remove_waiter(struct rt_mute
+       /* Store the lock on which owner is blocked or NULL */
+       next_lock = task_blocked_on_lock(owner);
+-      raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
++      raw_spin_unlock(&owner->pi_lock);
+       /*
+        * Don't walk the chain, if the owner task is not blocked
+@@ -1118,12 +1114,12 @@ static void remove_waiter(struct rt_mute
+       /* gets dropped in rt_mutex_adjust_prio_chain()! */
+       get_task_struct(owner);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irq(&lock->wait_lock);
+       rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+                                  next_lock, NULL, current);
+-      raw_spin_lock(&lock->wait_lock);
++      raw_spin_lock_irq(&lock->wait_lock);
+ }
+ /*
+@@ -1167,11 +1163,11 @@ void rt_mutex_init_waiter(struct rt_mute
+  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+  * @lock:              the rt_mutex to take
+  * @state:             the state the task should block in (TASK_INTERRUPTIBLE
+- *                     or TASK_UNINTERRUPTIBLE)
++ *                     or TASK_UNINTERRUPTIBLE)
+  * @timeout:           the pre-initialized and started timer, or NULL for none
+  * @waiter:            the pre-initialized rt_mutex_waiter
+  *
+- * lock->wait_lock must be held by the caller.
++ * Must be called with lock->wait_lock held and interrupts disabled
+  */
+ static int __sched
+ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
+@@ -1199,13 +1195,13 @@ __rt_mutex_slowlock(struct rt_mutex *loc
+                               break;
+               }
+-              raw_spin_unlock(&lock->wait_lock);
++              raw_spin_unlock_irq(&lock->wait_lock);
+               debug_rt_mutex_print_deadlock(waiter);
+               schedule();
+-              raw_spin_lock(&lock->wait_lock);
++              raw_spin_lock_irq(&lock->wait_lock);
+               set_current_state(state);
+       }
+@@ -1242,15 +1238,24 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+                 enum rtmutex_chainwalk chwalk)
+ {
+       struct rt_mutex_waiter waiter;
++      unsigned long flags;
+       int ret = 0;
+       rt_mutex_init_waiter(&waiter);
+-      raw_spin_lock(&lock->wait_lock);
++      /*
++       * Technically we could use raw_spin_[un]lock_irq() here, but this can
++       * be called in early boot if the cmpxchg() fast path is disabled
++       * (debug, no architecture support). In this case we will acquire the
++       * rtmutex with lock->wait_lock held. But we cannot unconditionally
++       * enable interrupts in that early boot case. So we need to use the
++       * irqsave/restore variants.
++       */
++      raw_spin_lock_irqsave(&lock->wait_lock, flags);
+       /* Try to acquire the lock again: */
+       if (try_to_take_rt_mutex(lock, current, NULL)) {
+-              raw_spin_unlock(&lock->wait_lock);
++              raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+               return 0;
+       }
+@@ -1279,7 +1284,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+        */
+       fixup_rt_mutex_waiters(lock);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+       /* Remove pending timer: */
+       if (unlikely(timeout))
+@@ -1308,6 +1313,7 @@ static inline int __rt_mutex_slowtrylock
+  */
+ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
+ {
++      unsigned long flags;
+       int ret;
+       /*
+@@ -1319,14 +1325,14 @@ static inline int rt_mutex_slowtrylock(s
+               return 0;
+       /*
+-       * The mutex has currently no owner. Lock the wait lock and
+-       * try to acquire the lock.
++       * The mutex has currently no owner. Lock the wait lock and try to
++       * acquire the lock. We use irqsave here to support early boot calls.
+        */
+-      raw_spin_lock(&lock->wait_lock);
++      raw_spin_lock_irqsave(&lock->wait_lock, flags);
+       ret = __rt_mutex_slowtrylock(lock);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+       return ret;
+ }
+@@ -1338,7 +1344,10 @@ static inline int rt_mutex_slowtrylock(s
+ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
+                                       struct wake_q_head *wake_q)
+ {
+-      raw_spin_lock(&lock->wait_lock);
++      unsigned long flags;
++
++      /* irqsave required to support early boot calls */
++      raw_spin_lock_irqsave(&lock->wait_lock, flags);
+       debug_rt_mutex_unlock(lock);
+@@ -1375,10 +1384,10 @@ static bool __sched rt_mutex_slowunlock(
+        */
+       while (!rt_mutex_has_waiters(lock)) {
+               /* Drops lock->wait_lock ! */
+-              if (unlock_rt_mutex_safe(lock) == true)
++              if (unlock_rt_mutex_safe(lock, flags) == true)
+                       return false;
+               /* Relock the rtmutex and try again */
+-              raw_spin_lock(&lock->wait_lock);
++              raw_spin_lock_irqsave(&lock->wait_lock, flags);
+       }
+       /*
+@@ -1389,7 +1398,7 @@ static bool __sched rt_mutex_slowunlock(
+        */
+       mark_wakeup_next_waiter(wake_q, lock);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+       /* check PI boosting */
+       return true;
+@@ -1680,10 +1689,10 @@ int rt_mutex_start_proxy_lock(struct rt_
+ {
+       int ret;
+-      raw_spin_lock(&lock->wait_lock);
++      raw_spin_lock_irq(&lock->wait_lock);
+       if (try_to_take_rt_mutex(lock, task, NULL)) {
+-              raw_spin_unlock(&lock->wait_lock);
++              raw_spin_unlock_irq(&lock->wait_lock);
+               return 1;
+       }
+@@ -1704,7 +1713,7 @@ int rt_mutex_start_proxy_lock(struct rt_
+       if (unlikely(ret))
+               remove_waiter(lock, waiter);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irq(&lock->wait_lock);
+       debug_rt_mutex_print_deadlock(waiter);
+@@ -1754,14 +1763,14 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ {
+       int ret;
+-      raw_spin_lock(&lock->wait_lock);
++      raw_spin_lock_irq(&lock->wait_lock);
+       set_current_state(TASK_INTERRUPTIBLE);
+       /* sleep on the mutex */
+       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+-      raw_spin_unlock(&lock->wait_lock);
++      raw_spin_unlock_irq(&lock->wait_lock);
+       return ret;
+ }
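[Editor's note: the hunks above repeat one mechanical change -- every wait_lock critical section becomes irq-safe, and the slow paths that can run in early boot (cmpxchg fast path disabled) use the irqsave/irqrestore variants rather than _irq, since they must not unconditionally re-enable interrupts. A condensed, hypothetical sketch of that pattern follows; demo_lock and demo_trylock are illustrative names, not part of the queued patch:]

    /*
     * Illustration only -- not part of the queued patch.
     * Sketch of the wait_lock pattern after b4abf91047cf.
     */
    #include <linux/sched.h>
    #include <linux/spinlock.h>

    struct demo_lock {
            raw_spinlock_t wait_lock;
            void *owner;
    };

    static int demo_trylock(struct demo_lock *dl)
    {
            unsigned long flags;
            int ret = 0;

            /*
             * irqsave rather than _irq: the previous interrupt state is
             * restored on unlock, so a caller that already runs with
             * interrupts disabled (or in early boot) is left untouched;
             * raw_spin_unlock_irq() would re-enable them unconditionally.
             */
            raw_spin_lock_irqsave(&dl->wait_lock, flags);
            if (!dl->owner) {
                    dl->owner = current;
                    ret = 1;
            }
            raw_spin_unlock_irqrestore(&dl->wait_lock, flags);

            return ret;
    }
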
diff --git a/queue-4.4/series b/queue-4.4/series
new file mode 100644 (file)
index 0000000..4f9a7ad
--- /dev/null
@@ -0,0 +1,11 @@
+futex-rename-free_pi_state-to-put_pi_state.patch
+futex-cleanup-refcounting.patch
+futex-rt_mutex-introduce-rt_mutex_init_waiter.patch
+futex-pull-rt_mutex_futex_unlock-out-from-under-hb-lock.patch
+futex-rework-futex_lock_pi-to-use-rt_mutex_-_proxy_lock.patch
+futex-futex_unlock_pi-determinism.patch
+rtmutex-make-wait_lock-irq-safe.patch
+futex-handle-transient-ownerless-rtmutex-state-correctly.patch
+futex-avoid-freeing-an-active-timer.patch
+futex-rt_mutex-fix-rt_mutex_cleanup_proxy_lock.patch
+rcu-update-documentation-of-rcu_read_unlock.patch