git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
patches for 4.19
author     Sasha Levin <sashal@kernel.org>
           Wed, 19 Dec 2018 00:46:56 +0000 (19:46 -0500)
committer  Sasha Levin <sashal@kernel.org>
           Wed, 19 Dec 2018 00:46:56 +0000 (19:46 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-4.19/ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch [new file with mode: 0644]
queue-4.19/locking-qspinlock-re-order-code.patch [new file with mode: 0644]
queue-4.19/locking-qspinlock-x86-provide-liveness-guarantee.patch [new file with mode: 0644]
queue-4.19/series [new file with mode: 0644]

diff --git a/queue-4.19/ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch b/queue-4.19/ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch
new file mode 100644 (file)
index 0000000..091f8d6
--- /dev/null
@@ -0,0 +1,137 @@
+From e6bd2d9ff28a627fa744173cbe2ae57c4cd76576 Mon Sep 17 00:00:00 2001
+From: "Michael J. Ruhl" <michael.j.ruhl@intel.com>
+Date: Tue, 18 Dec 2018 16:00:22 -0500
+Subject: IB/hfi1: Remove race conditions in user_sdma send path
+
+commit 28a9a9e83ceae2cee25b9af9ad20d53aaa9ab951 upstream
+
+Packet queue state is overused to determine both SDMA descriptor
+availability and packet queue request state.
+
+cpu 0  ret = user_sdma_send_pkts(req, pcount);
+cpu 0  if (atomic_read(&pq->n_reqs))
+cpu 1  IRQ user_sdma_txreq_cb calls pq_update() (state to _INACTIVE)
+cpu 0        xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+
+At this point pq->n_reqs == 0 and pq->state is incorrectly
+SDMA_PKT_Q_ACTIVE.  The close path will hang waiting for the state
+to return to _INACTIVE.
+
+This can also change the state from _DEFERRED to _ACTIVE.  However,
+this is a mostly benign race.
+
+Remove the racy code path.
+
+Use n_reqs to determine if a packet queue is active or not.
+
+Cc: <stable@vger.kernel.org> # 4.19.x
+Reviewed-by: Mitko Haralanov <mitko.haralanov@intel.com>
+Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Signed-off-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
+Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
+Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/infiniband/hw/hfi1/user_sdma.c | 24 ++++++++++--------------
+ drivers/infiniband/hw/hfi1/user_sdma.h |  9 +++++----
+ 2 files changed, 15 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
+index 39134dd305f5..51831bfbf90f 100644
+--- a/drivers/infiniband/hw/hfi1/user_sdma.c
++++ b/drivers/infiniband/hw/hfi1/user_sdma.c
+@@ -187,7 +187,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
+       pq->ctxt = uctxt->ctxt;
+       pq->subctxt = fd->subctxt;
+       pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+-      pq->state = SDMA_PKT_Q_INACTIVE;
+       atomic_set(&pq->n_reqs, 0);
+       init_waitqueue_head(&pq->wait);
+       atomic_set(&pq->n_locked, 0);
+@@ -276,7 +275,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
+               /* Wait until all requests have been freed. */
+               wait_event_interruptible(
+                       pq->wait,
+-                      (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
++                      !atomic_read(&pq->n_reqs));
+               kfree(pq->reqs);
+               kfree(pq->req_in_use);
+               kmem_cache_destroy(pq->txreq_cache);
+@@ -312,6 +311,13 @@ static u8 dlid_to_selector(u16 dlid)
+       return mapping[hash];
+ }
++/**
++ * hfi1_user_sdma_process_request() - Process and start a user sdma request
++ * @fd: valid file descriptor
++ * @iovec: array of io vectors to process
++ * @dim: overall iovec array size
++ * @count: number of io vector array entries processed
++ */
+ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
+                                  struct iovec *iovec, unsigned long dim,
+                                  unsigned long *count)
+@@ -560,20 +566,12 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
+               req->ahg_idx = sdma_ahg_alloc(req->sde);
+       set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
++      pq->state = SDMA_PKT_Q_ACTIVE;
+       /* Send the first N packets in the request to buy us some time */
+       ret = user_sdma_send_pkts(req, pcount);
+       if (unlikely(ret < 0 && ret != -EBUSY))
+               goto free_req;
+-      /*
+-       * It is possible that the SDMA engine would have processed all the
+-       * submitted packets by the time we get here. Therefore, only set
+-       * packet queue state to ACTIVE if there are still uncompleted
+-       * requests.
+-       */
+-      if (atomic_read(&pq->n_reqs))
+-              xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+-
+       /*
+        * This is a somewhat blocking send implementation.
+        * The driver will block the caller until all packets of the
+@@ -1409,10 +1407,8 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
+ static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
+ {
+-      if (atomic_dec_and_test(&pq->n_reqs)) {
+-              xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
++      if (atomic_dec_and_test(&pq->n_reqs))
+               wake_up(&pq->wait);
+-      }
+ }
+ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
+diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
+index 0ae06456c868..91c343f91776 100644
+--- a/drivers/infiniband/hw/hfi1/user_sdma.h
++++ b/drivers/infiniband/hw/hfi1/user_sdma.h
+@@ -105,9 +105,10 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size,
+ #define TXREQ_FLAGS_REQ_ACK   BIT(0)      /* Set the ACK bit in the header */
+ #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */
+-#define SDMA_PKT_Q_INACTIVE BIT(0)
+-#define SDMA_PKT_Q_ACTIVE   BIT(1)
+-#define SDMA_PKT_Q_DEFERRED BIT(2)
++enum pkt_q_sdma_state {
++      SDMA_PKT_Q_ACTIVE,
++      SDMA_PKT_Q_DEFERRED,
++};
+ /*
+  * Maximum retry attempts to submit a TX request
+@@ -133,7 +134,7 @@ struct hfi1_user_sdma_pkt_q {
+       struct user_sdma_request *reqs;
+       unsigned long *req_in_use;
+       struct iowait busy;
+-      unsigned state;
++      enum pkt_q_sdma_state state;
+       wait_queue_head_t wait;
+       unsigned long unpinned;
+       struct mmu_rb_handler *handler;
+-- 
+2.19.1
+
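
A minimal user-space sketch of the pattern the hfi1 patch above moves to, built on POSIX threads rather than the driver (pkt_q, submit_request, complete_request and drain_queue are invented names, not hfi1 symbols): the close path waits directly on the in-flight request count, so there is no separate state flag for a completion running on another CPU to race against.

#include <pthread.h>
#include <stdio.h>

struct pkt_q {
	pthread_mutex_t lock;
	pthread_cond_t  wait;
	unsigned int    n_reqs;          /* in-flight requests, like pq->n_reqs */
};

static void submit_request(struct pkt_q *pq)
{
	pthread_mutex_lock(&pq->lock);
	pq->n_reqs++;                    /* counted before the engine can complete it */
	pthread_mutex_unlock(&pq->lock);
	/* ...hand descriptors to the DMA engine here... */
}

/* Completion callback: the counter itself says whether the queue is active. */
static void complete_request(struct pkt_q *pq)
{
	pthread_mutex_lock(&pq->lock);
	if (--pq->n_reqs == 0)
		pthread_cond_broadcast(&pq->wait);   /* plays the role of wake_up(&pq->wait) */
	pthread_mutex_unlock(&pq->lock);
}

/* Close path: wait until every request has completed; no state flag involved. */
static void drain_queue(struct pkt_q *pq)
{
	pthread_mutex_lock(&pq->lock);
	while (pq->n_reqs)
		pthread_cond_wait(&pq->wait, &pq->lock);
	pthread_mutex_unlock(&pq->lock);
}

int main(void)
{
	struct pkt_q pq = {
		.lock   = PTHREAD_MUTEX_INITIALIZER,
		.wait   = PTHREAD_COND_INITIALIZER,
		.n_reqs = 0,
	};

	submit_request(&pq);
	complete_request(&pq);
	drain_queue(&pq);
	printf("drained, n_reqs=%u\n", pq.n_reqs);
	return 0;
}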
diff --git a/queue-4.19/locking-qspinlock-re-order-code.patch b/queue-4.19/locking-qspinlock-re-order-code.patch
new file mode 100644 (file)
index 0000000..da97a81
--- /dev/null
@@ -0,0 +1,100 @@
+From 88e538bc095147a501654eb7c3b2161396e41ba3 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Tue, 18 Dec 2018 18:48:27 +0100
+Subject: locking/qspinlock: Re-order code
+
+commit 53bf57fab7321fb42b703056a4c80fc9d986d170 upstream.
+
+Flip the branch condition after atomic_fetch_or_acquire(_Q_PENDING_VAL)
+such that we lose the indent. This also results in a more natural code
+flow IMO.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Will Deacon <will.deacon@arm.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: andrea.parri@amarulasolutions.com
+Cc: longman@redhat.com
+Link: https://lkml.kernel.org/r/20181003130257.156322446@infradead.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/locking/qspinlock.c | 56 ++++++++++++++++++--------------------
+ 1 file changed, 27 insertions(+), 29 deletions(-)
+
+diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
+index bfaeb05123ff..ec343276f975 100644
+--- a/kernel/locking/qspinlock.c
++++ b/kernel/locking/qspinlock.c
+@@ -330,39 +330,37 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+        * 0,0,1 -> 0,1,1 ; pending
+        */
+       val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+-      if (!(val & ~_Q_LOCKED_MASK)) {
+-              /*
+-               * We're pending, wait for the owner to go away.
+-               *
+-               * *,1,1 -> *,1,0
+-               *
+-               * this wait loop must be a load-acquire such that we match the
+-               * store-release that clears the locked bit and create lock
+-               * sequentiality; this is because not all
+-               * clear_pending_set_locked() implementations imply full
+-               * barriers.
+-               */
+-              if (val & _Q_LOCKED_MASK) {
+-                      atomic_cond_read_acquire(&lock->val,
+-                                               !(VAL & _Q_LOCKED_MASK));
+-              }
+-
+-              /*
+-               * take ownership and clear the pending bit.
+-               *
+-               * *,1,0 -> *,0,1
+-               */
+-              clear_pending_set_locked(lock);
+-              qstat_inc(qstat_lock_pending, true);
+-              return;
++      /*
++       * If we observe any contention; undo and queue.
++       */
++      if (unlikely(val & ~_Q_LOCKED_MASK)) {
++              if (!(val & _Q_PENDING_MASK))
++                      clear_pending(lock);
++              goto queue;
+       }
+       /*
+-       * If pending was clear but there are waiters in the queue, then
+-       * we need to undo our setting of pending before we queue ourselves.
++       * We're pending, wait for the owner to go away.
++       *
++       * 0,1,1 -> 0,1,0
++       *
++       * this wait loop must be a load-acquire such that we match the
++       * store-release that clears the locked bit and create lock
++       * sequentiality; this is because not all
++       * clear_pending_set_locked() implementations imply full
++       * barriers.
++       */
++      if (val & _Q_LOCKED_MASK)
++              atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
++
++      /*
++       * take ownership and clear the pending bit.
++       *
++       * 0,1,0 -> 0,0,1
+        */
+-      if (!(val & _Q_PENDING_MASK))
+-              clear_pending(lock);
++      clear_pending_set_locked(lock);
++      qstat_inc(qstat_lock_pending, true);
++      return;
+       /*
+        * End of pending bit optimistic spinning and beginning of MCS
+-- 
+2.19.1
+
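
A toy before/after sketch of the restructuring described in the patch above, using invented stubs (still_locked, take_ownership, queue_self) rather than the real qspinlock code: flipping the branch condition turns the contended case into an early exit, so the common path sits at the outer indentation level.

#include <stdbool.h>
#include <stdio.h>

static bool still_locked(void)   { return false; }  /* stub: the owner has released */
static void take_ownership(void) { }
static int  queue_self(void)     { return 1; }      /* stand-in for the MCS queue path */

/* Before: the uncontended fast path is nested one level inside the if-body. */
static int lock_before(bool contended, bool locked)
{
	if (!contended) {
		while (locked)
			locked = still_locked();     /* wait for the owner to go away */
		take_ownership();
		return 0;
	}
	return queue_self();
}

/* After: observe contention, bail out early; the fast path loses an indent. */
static int lock_after(bool contended, bool locked)
{
	if (contended)
		return queue_self();

	while (locked)
		locked = still_locked();
	take_ownership();
	return 0;
}

int main(void)
{
	printf("%d %d\n", lock_before(false, true), lock_after(true, false));
	return 0;
}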
diff --git a/queue-4.19/locking-qspinlock-x86-provide-liveness-guarantee.patch b/queue-4.19/locking-qspinlock-x86-provide-liveness-guarantee.patch
new file mode 100644 (file)
index 0000000..d19ff1b
--- /dev/null
@@ -0,0 +1,153 @@
+From ebda9c0f20ee3152eb7e24e4ea8d30f042e499d6 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Tue, 18 Dec 2018 18:48:28 +0100
+Subject: locking/qspinlock, x86: Provide liveness guarantee
+
+commit 7aa54be2976550f17c11a1c3e3630002dea39303 upstream.
+
+On x86 we cannot do fetch_or() with a single instruction and thus end up
+using a cmpxchg loop, which reduces determinism. Replace the fetch_or()
+with a composite operation: tas-pending + load.
+
+Using two instructions of course opens a window we previously did not
+have. Consider the scenario:
+
+       CPU0            CPU1            CPU2
+
+ 1)    lock
+         trylock -> (0,0,1)
+
+ 2)                    lock
+                         trylock /* fail */
+
+ 3)    unlock -> (0,0,0)
+
+ 4)                                    lock
+                                         trylock -> (0,0,1)
+
+ 5)                      tas-pending -> (0,1,1)
+                         load-val <- (0,1,0) from 3
+
+ 6)                      clear-pending-set-locked -> (0,0,1)
+
+                         FAIL: _2_ owners
+
+where 5) is our new composite operation. When we consider each part of
+the qspinlock state as a separate variable (as we can when
+_Q_PENDING_BITS == 8) then the above is entirely possible, because
+tas-pending will only RmW the pending byte, so the later load is able
+to observe prior tail and lock state (but not earlier than its own
+trylock, which operates on the whole word, due to coherence).
+
+To avoid this we need 2 things:
+
+ - the load must come after the tas-pending (obviously, otherwise it
+   can trivially observe prior state).
+
+ - the tas-pending must be a full word RmW instruction; it cannot be, for
+   example, a byte-wide XCHGB, such that we cannot observe other state
+   prior to setting pending.
+
+On x86 we can realize this by using "LOCK BTS m32, r32" for
+tas-pending followed by a regular load.
+
+Note that observing later state is not a problem:
+
+ - if we fail to observe a later unlock, we'll simply spin-wait for
+   that store to become visible.
+
+ - if we observe a later xchg_tail(), there is no difference from that
+   xchg_tail() having taken place before the tas-pending.
+
+Suggested-by: Will Deacon <will.deacon@arm.com>
+Reported-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Will Deacon <will.deacon@arm.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: andrea.parri@amarulasolutions.com
+Cc: longman@redhat.com
+Fixes: 59fb586b4a07 ("locking/qspinlock: Remove unbounded cmpxchg() loop from locking slowpath")
+Link: https://lkml.kernel.org/r/20181003130957.183726335@infradead.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+[bigeasy: GEN_BINARY_RMWcc macro redo]
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/qspinlock.h | 21 +++++++++++++++++++++
+ kernel/locking/qspinlock.c       | 17 ++++++++++++++++-
+ 2 files changed, 37 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
+index 3e70bed8a978..055c60a05756 100644
+--- a/arch/x86/include/asm/qspinlock.h
++++ b/arch/x86/include/asm/qspinlock.h
+@@ -6,9 +6,30 @@
+ #include <asm/cpufeature.h>
+ #include <asm-generic/qspinlock_types.h>
+ #include <asm/paravirt.h>
++#include <asm/rmwcc.h>
+ #define _Q_PENDING_LOOPS      (1 << 9)
++#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire
++
++static __always_inline bool __queued_RMW_btsl(struct qspinlock *lock)
++{
++      GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter,
++                       "I", _Q_PENDING_OFFSET, "%0", c);
++}
++
++static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
++{
++      u32 val = 0;
++
++      if (__queued_RMW_btsl(lock))
++              val |= _Q_PENDING_VAL;
++
++      val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
++
++      return val;
++}
++
+ #ifdef CONFIG_PARAVIRT_SPINLOCKS
+ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+ extern void __pv_init_lock_hash(void);
+diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
+index ec343276f975..edd75e0c7d96 100644
+--- a/kernel/locking/qspinlock.c
++++ b/kernel/locking/qspinlock.c
+@@ -231,6 +231,20 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+ }
+ #endif /* _Q_PENDING_BITS == 8 */
++/**
++ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
++ * @lock : Pointer to queued spinlock structure
++ * Return: The previous lock value
++ *
++ * *,*,* -> *,1,*
++ */
++#ifndef queued_fetch_set_pending_acquire
++static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
++{
++      return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
++}
++#endif
++
+ /**
+  * set_locked - Set the lock bit and own the lock
+  * @lock: Pointer to queued spinlock structure
+@@ -329,7 +343,8 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+        * 0,0,0 -> 0,0,1 ; trylock
+        * 0,0,1 -> 0,1,1 ; pending
+        */
+-      val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
++      val = queued_fetch_set_pending_acquire(lock);
++
+       /*
+        * If we observe any contention; undo and queue.
+        */
+-- 
+2.19.1
+
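
A user-space C11 sketch of the two variants discussed in the patch above (this is not the kernel code: the bit layout and the Q_PENDING_* constants are assumptions for illustration, and atomic_fetch_or stands in for LOCK BTS, which portable C cannot express). It mirrors how the x86 queued_fetch_set_pending_acquire() assembles its return value from the old pending bit plus a later load of the rest of the word.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define Q_PENDING_VAL   (1u << 8)        /* assumed layout: bits 0-7 locked, 8-15 pending */
#define Q_PENDING_MASK  0x0000ff00u

/* Generic version: one full-word RMW, as in the #ifndef fallback above. */
static uint32_t fetch_set_pending_generic(_Atomic uint32_t *val)
{
	return atomic_fetch_or_explicit(val, Q_PENDING_VAL, memory_order_acquire);
}

/*
 * x86-style composite: a full-word RMW that only flips the pending bit
 * (standing in for LOCK BTS), followed by a plain load of the rest of the
 * word.  Because the load comes after the RMW it can only observe state at
 * least as new as the tas-pending, which the commit message argues is
 * harmless; an operation narrower than the whole word could let us observe
 * *older* tail/lock state, which is the bug being avoided.
 */
static uint32_t fetch_set_pending_composite(_Atomic uint32_t *val)
{
	uint32_t old = atomic_fetch_or_explicit(val, Q_PENDING_VAL,
						memory_order_acquire);
	uint32_t ret = old & Q_PENDING_MASK;          /* previous pending bit only */

	ret |= atomic_load_explicit(val, memory_order_relaxed) & ~Q_PENDING_MASK;
	return ret;
}

int main(void)
{
	_Atomic uint32_t lock = 1;                    /* (tail=0, pending=0, locked=1) */

	printf("generic:   %#x\n", (unsigned)fetch_set_pending_generic(&lock));
	atomic_store(&lock, 1);
	printf("composite: %#x\n", (unsigned)fetch_set_pending_composite(&lock));
	return 0;
}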
diff --git a/queue-4.19/series b/queue-4.19/series
new file mode 100644 (file)
index 0000000..210802f
--- /dev/null
@@ -0,0 +1,3 @@
+locking-qspinlock-re-order-code.patch
+locking-qspinlock-x86-provide-liveness-guarantee.patch
+ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch