From: Sasha Levin
Date: Wed, 19 Dec 2018 00:46:56 +0000 (-0500)
Subject: patches for 4.19
X-Git-Tag: v4.19.11~12
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=96d158a613a48d11cd2d1c7ec6b65c663fe5ace2;p=thirdparty%2Fkernel%2Fstable-queue.git

patches for 4.19

Signed-off-by: Sasha Levin
---

diff --git a/queue-4.19/ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch b/queue-4.19/ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch
new file mode 100644
index 00000000000..091f8d6ae3f
--- /dev/null
+++ b/queue-4.19/ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch
@@ -0,0 +1,137 @@
+From e6bd2d9ff28a627fa744173cbe2ae57c4cd76576 Mon Sep 17 00:00:00 2001
+From: "Michael J. Ruhl"
+Date: Tue, 18 Dec 2018 16:00:22 -0500
+Subject: IB/hfi1: Remove race conditions in user_sdma send path
+
+commit 28a9a9e83ceae2cee25b9af9ad20d53aaa9ab951 upstream
+
+Packet queue state is overused to determine SDMA descriptor
+availability and packet queue request state.
+
+cpu 0  ret = user_sdma_send_pkts(req, pcount);
+cpu 0  if (atomic_read(&pq->n_reqs))
+cpu 1  IRQ user_sdma_txreq_cb calls pq_update() (state to _INACTIVE)
+cpu 0  xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+
+At this point pq->n_reqs == 0 and pq->state is incorrectly
+SDMA_PKT_Q_ACTIVE. The close path will hang waiting for the state
+to return to _INACTIVE.
+
+This can also change the state from _DEFERRED to _ACTIVE. However,
+this is a mostly benign race.
+
+Remove the racy code path.
+
+Use n_reqs to determine if a packet queue is active or not.
+
+Cc: # 4.19.x
+Reviewed-by: Mitko Haralanov
+Reviewed-by: Mike Marciniszyn
+Signed-off-by: Michael J. Ruhl
+Signed-off-by: Dennis Dalessandro
+Signed-off-by: Jason Gunthorpe
+Signed-off-by: Sasha Levin
+---
+ drivers/infiniband/hw/hfi1/user_sdma.c | 24 ++++++++++--------------
+ drivers/infiniband/hw/hfi1/user_sdma.h | 9 +++++----
+ 2 files changed, 15 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
+index 39134dd305f5..51831bfbf90f 100644
+--- a/drivers/infiniband/hw/hfi1/user_sdma.c
++++ b/drivers/infiniband/hw/hfi1/user_sdma.c
+@@ -187,7 +187,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
+ pq->ctxt = uctxt->ctxt;
+ pq->subctxt = fd->subctxt;
+ pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+- pq->state = SDMA_PKT_Q_INACTIVE;
+ atomic_set(&pq->n_reqs, 0);
+ init_waitqueue_head(&pq->wait);
+ atomic_set(&pq->n_locked, 0);
+@@ -276,7 +275,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
+ /* Wait until all requests have been freed. */
+ wait_event_interruptible(
+ pq->wait,
+- (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
++ !atomic_read(&pq->n_reqs));
+ kfree(pq->reqs);
+ kfree(pq->req_in_use);
+ kmem_cache_destroy(pq->txreq_cache);
+@@ -312,6 +311,13 @@ static u8 dlid_to_selector(u16 dlid)
+ return mapping[hash];
+ }
+
++/**
++ * hfi1_user_sdma_process_request() - Process and start a user sdma request
++ * @fd: valid file descriptor
++ * @iovec: array of io vectors to process
++ * @dim: overall iovec array size
++ * @count: number of io vector array entries processed
++ */
+ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
+ struct iovec *iovec, unsigned long dim,
+ unsigned long *count)
+@@ -560,20 +566,12 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
+ req->ahg_idx = sdma_ahg_alloc(req->sde);
+
+ set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
++ pq->state = SDMA_PKT_Q_ACTIVE;
+ /* Send the first N packets in the request to buy us some time */
+ ret = user_sdma_send_pkts(req, pcount);
+ if (unlikely(ret < 0 && ret != -EBUSY))
+ goto free_req;
+
+- /*
+- * It is possible that the SDMA engine would have processed all the
+- * submitted packets by the time we get here. Therefore, only set
+- * packet queue state to ACTIVE if there are still uncompleted
+- * requests.
+- */
+- if (atomic_read(&pq->n_reqs))
+- xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+-
+ /*
+ * This is a somewhat blocking send implementation.
+ * The driver will block the caller until all packets of the
+@@ -1409,10 +1407,8 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
+
+ static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
+ {
+- if (atomic_dec_and_test(&pq->n_reqs)) {
+- xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
++ if (atomic_dec_and_test(&pq->n_reqs))
+ wake_up(&pq->wait);
+- }
+ }
+
+ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
+diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
+index 0ae06456c868..91c343f91776 100644
+--- a/drivers/infiniband/hw/hfi1/user_sdma.h
++++ b/drivers/infiniband/hw/hfi1/user_sdma.h
+@@ -105,9 +105,10 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size,
+ #define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */
+ #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */
+
+-#define SDMA_PKT_Q_INACTIVE BIT(0)
+-#define SDMA_PKT_Q_ACTIVE BIT(1)
+-#define SDMA_PKT_Q_DEFERRED BIT(2)
++enum pkt_q_sdma_state {
++ SDMA_PKT_Q_ACTIVE,
++ SDMA_PKT_Q_DEFERRED,
++};
+
+ /*
+ * Maximum retry attempts to submit a TX request
+@@ -133,7 +134,7 @@ struct hfi1_user_sdma_pkt_q {
+ struct user_sdma_request *reqs;
+ unsigned long *req_in_use;
+ struct iowait busy;
+- unsigned state;
++ enum pkt_q_sdma_state state;
+ wait_queue_head_t wait;
+ unsigned long unpinned;
+ struct mmu_rb_handler *handler;
+--
+2.19.1
+
diff --git a/queue-4.19/locking-qspinlock-re-order-code.patch b/queue-4.19/locking-qspinlock-re-order-code.patch
new file mode 100644
index 00000000000..da97a81cdd6
--- /dev/null
+++ b/queue-4.19/locking-qspinlock-re-order-code.patch
@@ -0,0 +1,100 @@
+From 88e538bc095147a501654eb7c3b2161396e41ba3 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra
+Date: Tue, 18 Dec 2018 18:48:27 +0100
+Subject: locking/qspinlock: Re-order code
+
+commit 53bf57fab7321fb42b703056a4c80fc9d986d170 upstream.
+
+Flip the branch condition after atomic_fetch_or_acquire(_Q_PENDING_VAL)
+such that we lose the indent. This also results in a more natural code
+flow IMO.
+
+Signed-off-by: Peter Zijlstra (Intel)
+Acked-by: Will Deacon
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Cc: andrea.parri@amarulasolutions.com
+Cc: longman@redhat.com
+Link: https://lkml.kernel.org/r/20181003130257.156322446@infradead.org
+Signed-off-by: Ingo Molnar
+Signed-off-by: Sebastian Andrzej Siewior
+Signed-off-by: Sasha Levin
+---
+ kernel/locking/qspinlock.c | 56 ++++++++++++++++++--------------------
+ 1 file changed, 27 insertions(+), 29 deletions(-)
+
+diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
+index bfaeb05123ff..ec343276f975 100644
+--- a/kernel/locking/qspinlock.c
++++ b/kernel/locking/qspinlock.c
+@@ -330,39 +330,37 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+ * 0,0,1 -> 0,1,1 ; pending
+ */
+ val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+- if (!(val & ~_Q_LOCKED_MASK)) {
+- /*
+- * We're pending, wait for the owner to go away.
+- *
+- * *,1,1 -> *,1,0
+- *
+- * this wait loop must be a load-acquire such that we match the
+- * store-release that clears the locked bit and create lock
+- * sequentiality; this is because not all
+- * clear_pending_set_locked() implementations imply full
+- * barriers.
+- */
+- if (val & _Q_LOCKED_MASK) {
+- atomic_cond_read_acquire(&lock->val,
+- !(VAL & _Q_LOCKED_MASK));
+- }
+-
+- /*
+- * take ownership and clear the pending bit.
+- *
+- * *,1,0 -> *,0,1
+- */
+- clear_pending_set_locked(lock);
+- qstat_inc(qstat_lock_pending, true);
+- return;
++ /*
++ * If we observe any contention; undo and queue.
++ */
++ if (unlikely(val & ~_Q_LOCKED_MASK)) {
++ if (!(val & _Q_PENDING_MASK))
++ clear_pending(lock);
++ goto queue;
+ }
+
+ /*
+- * If pending was clear but there are waiters in the queue, then
+- * we need to undo our setting of pending before we queue ourselves.
++ * We're pending, wait for the owner to go away.
++ *
++ * 0,1,1 -> 0,1,0
++ *
++ * this wait loop must be a load-acquire such that we match the
++ * store-release that clears the locked bit and create lock
++ * sequentiality; this is because not all
++ * clear_pending_set_locked() implementations imply full
++ * barriers.
++ */
++ if (val & _Q_LOCKED_MASK)
++ atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
++
++ /*
++ * take ownership and clear the pending bit.
++ *
++ * 0,1,0 -> 0,0,1
+ */
+- if (!(val & _Q_PENDING_MASK))
+- clear_pending(lock);
++ clear_pending_set_locked(lock);
++ qstat_inc(qstat_lock_pending, true);
++ return;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+--
+2.19.1
+
diff --git a/queue-4.19/locking-qspinlock-x86-provide-liveness-guarantee.patch b/queue-4.19/locking-qspinlock-x86-provide-liveness-guarantee.patch
new file mode 100644
index 00000000000..d19ff1bbef5
--- /dev/null
+++ b/queue-4.19/locking-qspinlock-x86-provide-liveness-guarantee.patch
@@ -0,0 +1,153 @@
+From ebda9c0f20ee3152eb7e24e4ea8d30f042e499d6 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra
+Date: Tue, 18 Dec 2018 18:48:28 +0100
+Subject: locking/qspinlock, x86: Provide liveness guarantee
+
+commit 7aa54be2976550f17c11a1c3e3630002dea39303 upstream.
+
+On x86 we cannot do fetch_or() with a single instruction and thus end up
+using a cmpxchg loop, this reduces determinism. Replace the fetch_or()
+with a composite operation: tas-pending + load.
+
+Using two instructions of course opens a window we previously did not
+have. Consider the scenario:
+
+    CPU0                 CPU1                 CPU2
+
+ 1) lock
+      trylock -> (0,0,1)
+
+ 2)                      lock
+                           trylock /* fail */
+
+ 3) unlock -> (0,0,0)
+
+ 4)                                           lock
+                                                trylock -> (0,0,1)
+
+ 5)                        tas-pending -> (0,1,1)
+                           load-val <- (0,1,0) from 3
+
+ 6)                        clear-pending-set-locked -> (0,0,1)
+
+                           FAIL: _2_ owners
+
+where 5) is our new composite operation. When we consider each part of
+the qspinlock state as a separate variable (as we can when
+_Q_PENDING_BITS == 8) then the above is entirely possible, because
+tas-pending will only RmW the pending byte, so the later load is able
+to observe prior tail and lock state (but not earlier than its own
+trylock, which operates on the whole word, due to coherence).
+
+To avoid this we need 2 things:
+
+ - the load must come after the tas-pending (obviously, otherwise it
+   can trivially observe prior state).
+
+ - the tas-pending must be a full word RmW instruction, it cannot be an XCHGB for
+   example, such that we cannot observe other state prior to setting
+   pending.
+
+On x86 we can realize this by using "LOCK BTS m32, r32" for
+tas-pending followed by a regular load.
+
+Note that observing later state is not a problem:
+
+ - if we fail to observe a later unlock, we'll simply spin-wait for
+   that store to become visible.
+
+ - if we observe a later xchg_tail(), there is no difference from that
+   xchg_tail() having taken place before the tas-pending.
+
+Suggested-by: Will Deacon
+Reported-by: Thomas Gleixner
+Signed-off-by: Peter Zijlstra (Intel)
+Reviewed-by: Will Deacon
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: andrea.parri@amarulasolutions.com
+Cc: longman@redhat.com
+Fixes: 59fb586b4a07 ("locking/qspinlock: Remove unbounded cmpxchg() loop from locking slowpath")
+Link: https://lkml.kernel.org/r/20181003130957.183726335@infradead.org
+Signed-off-by: Ingo Molnar
+[bigeasy: GEN_BINARY_RMWcc macro redo]
+Signed-off-by: Sebastian Andrzej Siewior
+Signed-off-by: Sasha Levin
+---
+ arch/x86/include/asm/qspinlock.h | 21 +++++++++++++++++++++
+ kernel/locking/qspinlock.c | 17 ++++++++++++++++-
+ 2 files changed, 37 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
+index 3e70bed8a978..055c60a05756 100644
+--- a/arch/x86/include/asm/qspinlock.h
++++ b/arch/x86/include/asm/qspinlock.h
+@@ -6,9 +6,30 @@
+ #include
+ #include
+ #include
++#include
+
+ #define _Q_PENDING_LOOPS (1 << 9)
+
++#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire
++
++static __always_inline bool __queued_RMW_btsl(struct qspinlock *lock)
++{
++ GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter,
++ "I", _Q_PENDING_OFFSET, "%0", c);
++}
++
++static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
++{
++ u32 val = 0;
++
++ if (__queued_RMW_btsl(lock))
++ val |= _Q_PENDING_VAL;
++
++ val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
++
++ return val;
++}
++
+ #ifdef CONFIG_PARAVIRT_SPINLOCKS
+ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+ extern void __pv_init_lock_hash(void);
+diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
+index ec343276f975..edd75e0c7d96 100644
+--- a/kernel/locking/qspinlock.c
++++ b/kernel/locking/qspinlock.c
+@@ -231,6 +231,20 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+ }
+ #endif /* _Q_PENDING_BITS == 8 */
+
++/**
++ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
++ * @lock : Pointer to queued spinlock structure
++ * Return: The previous lock value
++ *
++ * *,*,* -> *,1,*
++ */
++#ifndef queued_fetch_set_pending_acquire
++static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
++{
++ return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
++}
++#endif
++
+ /**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+@@ -329,7 +343,8 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+ * 0,0,0 -> 0,0,1 ; trylock
+ * 0,0,1 -> 0,1,1 ; pending
+ */
+- val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
++ val = queued_fetch_set_pending_acquire(lock);
++
+ /*
+ * If we observe any contention; undo and queue.
+ */
+--
+2.19.1
+
diff --git a/queue-4.19/series b/queue-4.19/series
new file mode 100644
index 00000000000..210802f2385
--- /dev/null
+++ b/queue-4.19/series
@@ -0,0 +1,3 @@
+locking-qspinlock-re-order-code.patch
+locking-qspinlock-x86-provide-liveness-guarantee.patch
+ib-hfi1-remove-race-conditions-in-user_sdma-send-pat.patch