From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 3 Dec 2024 08:27:57 +0000 (+0100)
Subject: 6.11-stable patches
X-Git-Tag: v4.19.325~65
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=46cb70f868b704f895a5e4e2f1a2c382a5f32ade;p=thirdparty%2Fkernel%2Fstable-queue.git

6.11-stable patches

added patches:
	usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch
	usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch
	usb-xhci-limit-stop-endpoint-retries.patch
---

diff --git a/queue-6.11/series b/queue-6.11/series
index 8748cd64ebe..2bace1ecfe6 100644
--- a/queue-6.11/series
+++ b/queue-6.11/series
@@ -677,3 +677,6 @@ xhci-combine-two-if-statements-for-etron-xhci-host.patch
 xhci-don-t-perform-soft-retry-for-etron-xhci-host.patch
 xhci-don-t-issue-reset-device-command-to-etron-xhci-host.patch
 bluetooth-fix-type-of-len-in-rfcomm_sock_getsockopt-_old.patch
+usb-xhci-limit-stop-endpoint-retries.patch
+usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch
+usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch
diff --git a/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch b/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch
new file mode 100644
index 00000000000..fea1fba3ed1
--- /dev/null
+++ b/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch
@@ -0,0 +1,111 @@
+From 474538b8dd1cd9c666e56cfe8ef60fbb0fb513f4 Mon Sep 17 00:00:00 2001
+From: Michal Pecio <michal.pecio@gmail.com>
+Date: Wed, 6 Nov 2024 12:14:59 +0200
+Subject: usb: xhci: Avoid queuing redundant Stop Endpoint commands
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit 474538b8dd1cd9c666e56cfe8ef60fbb0fb513f4 upstream.
+
+Stop Endpoint command on an already stopped endpoint fails and may be
+misinterpreted as a known hardware bug by the completion handler. This
+results in an unnecessary delay with repeated retries of the command.
+
+Avoid queuing this command when endpoint state flags indicate that it's
+stopped or halted and the command will fail. If commands are pending on
+the endpoint, their completion handlers will process cancelled TDs so
+it's done. In case of waiting for external operations like clearing TT
+buffer, the endpoint is stopped and cancelled TDs can be processed now.
+
+This eliminates practically all unnecessary retries because an endpoint
+with pending URBs is maintained in Running state by the driver, unless
+aforementioned commands or other operations are pending on it. This is
+guaranteed by xhci_ring_ep_doorbell() and by the fact that it is called
+every time any of those operations completes.
+
+The only known exceptions are hardware bugs (the endpoint never starts
+at all) and Stream Protocol errors not associated with any TRB, which
+cause an endpoint reset not followed by restart. Sounds like a bug.
+
+Generally, these retries are only expected to happen when the endpoint
+fails to start for unknown/no reason, which is a worse problem itself,
+and fixing the bug eliminates the retries too.
+
+All cases were tested and found to work as expected. SET_DEQ_PENDING
+was produced by patching uvcvideo to unlink URBs in 100us intervals,
+which then runs into this case very often. EP_HALTED was produced by
+restarting 'cat /dev/ttyUSB0' on a serial dongle with broken cable.
+EP_CLEARING_TT by the same, with the dongle on an external hub.
+
+Fixes: fd9d55d190c0 ("xhci: retry Stop Endpoint on buggy NEC controllers")
+CC: stable@vger.kernel.org
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20241106101459.775897-34-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-ring.c |   13 +++++++++++++
+ drivers/usb/host/xhci.c      |   19 +++++++++++++++----
+ drivers/usb/host/xhci.h      |    1 +
+ 3 files changed, 29 insertions(+), 4 deletions(-)
+
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -1070,6 +1070,19 @@ static int xhci_invalidate_cancelled_tds
+ }
+ 
+ /*
++ * Erase queued TDs from transfer ring(s) and give back those the xHC didn't
++ * stop on. If necessary, queue commands to move the xHC off cancelled TDs it
++ * stopped on. Those will be given back later when the commands complete.
++ *
++ * Call under xhci->lock on a stopped endpoint.
++ */
++void xhci_process_cancelled_tds(struct xhci_virt_ep *ep)
++{
++	xhci_invalidate_cancelled_tds(ep);
++	xhci_giveback_invalidated_tds(ep);
++}
++
++/*
+  * Returns the TD the endpoint ring halted on.
+  * Only call for non-running rings without streams.
+  */
+--- a/drivers/usb/host/xhci.c
++++ b/drivers/usb/host/xhci.c
+@@ -1769,10 +1769,21 @@ static int xhci_urb_dequeue(struct usb_h
+ 		}
+ 	}
+ 
+-	/* Queue a stop endpoint command, but only if this is
+-	 * the first cancellation to be handled.
+-	 */
+-	if (!(ep->ep_state & EP_STOP_CMD_PENDING)) {
++	/* These completion handlers will sort out cancelled TDs for us */
++	if (ep->ep_state & (EP_STOP_CMD_PENDING | EP_HALTED | SET_DEQ_PENDING)) {
++		xhci_dbg(xhci, "Not queuing Stop Endpoint on slot %d ep %d in state 0x%x\n",
++				urb->dev->slot_id, ep_index, ep->ep_state);
++		goto done;
++	}
++
++	/* In this case no commands are pending but the endpoint is stopped */
++	if (ep->ep_state & EP_CLEARING_TT) {
++		/* and cancelled TDs can be given back right away */
++		xhci_dbg(xhci, "Invalidating TDs instantly on slot %d ep %d in state 0x%x\n",
++				urb->dev->slot_id, ep_index, ep->ep_state);
++		xhci_process_cancelled_tds(ep);
++	} else {
++		/* Otherwise, queue a new Stop Endpoint command */
+ 		command = xhci_alloc_command(xhci, false, GFP_ATOMIC);
+ 		if (!command) {
+ 			ret = -ENOMEM;
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -1921,6 +1921,7 @@ void xhci_ring_doorbell_for_active_rings
+ void xhci_cleanup_command_queue(struct xhci_hcd *xhci);
+ void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring);
+ unsigned int count_trbs(u64 addr, u64 len);
++void xhci_process_cancelled_tds(struct xhci_virt_ep *ep);
+ 
+ /* xHCI roothub code */
+ void xhci_set_link_state(struct xhci_hcd *xhci, struct xhci_port *port,
diff --git a/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch b/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch
new file mode 100644
index 00000000000..7a01dd9ff7e
--- /dev/null
+++ b/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch
@@ -0,0 +1,102 @@
+From 484c3bab2d5dfa13ff659a51a06e9a393141eefc Mon Sep 17 00:00:00 2001
+From: Michal Pecio <michal.pecio@gmail.com>
+Date: Wed, 6 Nov 2024 12:14:58 +0200
+Subject: usb: xhci: Fix TD invalidation under pending Set TR Dequeue
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit 484c3bab2d5dfa13ff659a51a06e9a393141eefc upstream.
+
+xhci_invalidate_cancelled_tds() may not work correctly if the hardware
+is modifying endpoint or stream contexts at the same time by executing
+a Set TR Dequeue command. And even if it worked, it would be unable to
+queue Set TR Dequeue for the next stream, failing to clear xHC cache.
+
+On stream endpoints, a chain of Set TR Dequeue commands may take some
+time to execute and we may want to cancel more TDs during this time.
+Currently this leads to Stop Endpoint completion handler calling this
+function without testing for SET_DEQ_PENDING, which will trigger the
+aforementioned problems when it happens.
+
+On all endpoints, a halt condition causes Reset Endpoint to be queued
+and an error status given to the class driver, which may unlink more
+URBs in response. Stop Endpoint is queued and its handler may execute
+concurrently with Set TR Dequeue queued by Reset Endpoint handler.
+
+(Reset Endpoint handler calls this function too, but there seems to
+be no possibility of it running concurrently with Set TR Dequeue).
+
+Fix xhci_invalidate_cancelled_tds() to work correctly under a pending
+Set TR Dequeue. Bail out of the function when SET_DEQ_PENDING is set,
+then make the completion handler call the function again and also call
+xhci_giveback_invalidated_tds(), which needs to be called next.
+
+This seems to fix another potential bug, where the handler would call
+xhci_invalidate_cancelled_tds(), which may clear some deferred TDs if
+a sanity check fails, and the TDs wouldn't be given back promptly.
+
+Said sanity check seems to be wrong and prone to false positives when
+the endpoint halts, but fixing it is beyond the scope of this change,
+besides ensuring that cleared TDs are given back properly.
+
+Fixes: 5ceac4402f5d ("xhci: Handle TD clearing for multiple streams case")
+CC: stable@vger.kernel.org
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20241106101459.775897-33-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-ring.c |   18 +++++++++++++-----
+ 1 file changed, 13 insertions(+), 5 deletions(-)
+
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -973,6 +973,13 @@ static int xhci_invalidate_cancelled_tds
+ 	unsigned int		slot_id = ep->vdev->slot_id;
+ 	int			err;
+ 
++	/*
++	 * This is not going to work if the hardware is changing its dequeue
++	 * pointers as we look at them. Completion handler will call us later.
++	 */
++	if (ep->ep_state & SET_DEQ_PENDING)
++		return 0;
++
+ 	xhci = ep->xhci;
+ 
+ 	list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list, cancelled_td_list) {
+@@ -1359,7 +1366,6 @@ static void xhci_handle_cmd_set_deq(stru
+ 	struct xhci_ep_ctx *ep_ctx;
+ 	struct xhci_slot_ctx *slot_ctx;
+ 	struct xhci_td *td, *tmp_td;
+-	bool deferred = false;
+ 
+ 	ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3]));
+ 	stream_id = TRB_TO_STREAM_ID(le32_to_cpu(trb->generic.field[2]));
+@@ -1460,8 +1466,6 @@ static void xhci_handle_cmd_set_deq(stru
+ 			xhci_dbg(ep->xhci, "%s: Giveback cancelled URB %p TD\n",
+ 				 __func__, td->urb);
+ 			xhci_td_cleanup(ep->xhci, td, ep_ring, td->status);
+-		} else if (td->cancel_status == TD_CLEARING_CACHE_DEFERRED) {
+-			deferred = true;
+ 		} else {
+ 			xhci_dbg(ep->xhci, "%s: Keep cancelled URB %p TD as cancel_status is %d\n",
+ 				 __func__, td->urb, td->cancel_status);
+@@ -1472,11 +1476,15 @@ cleanup:
+ 	ep->queued_deq_seg = NULL;
+ 	ep->queued_deq_ptr = NULL;
+ 
+-	if (deferred) {
+-		/* We have more streams to clear */
++	/* Check for deferred or newly cancelled TDs */
++	if (!list_empty(&ep->cancelled_td_list)) {
+ 		xhci_dbg(ep->xhci, "%s: Pending TDs to clear, continuing with invalidation\n",
+ 			 __func__);
+ 		xhci_invalidate_cancelled_tds(ep);
++		/* Try to restart the endpoint if all is done */
++		ring_doorbell_for_active_rings(xhci, slot_id, ep_index);
++		/* Start giving back any TDs invalidated above */
++		xhci_giveback_invalidated_tds(ep);
+ 	} else {
+ 		/* Restart any rings with pending URBs */
+ 		xhci_dbg(ep->xhci, "%s: All TDs cleared, ring doorbell\n", __func__);
diff --git a/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch b/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch
new file mode 100644
index 00000000000..a18a21f6911
--- /dev/null
+++ b/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch
@@ -0,0 +1,138 @@
+From 42b7581376015c1bbcbe5831f043cd0ac119d028 Mon Sep 17 00:00:00 2001
+From: Michal Pecio <michal.pecio@gmail.com>
+Date: Wed, 6 Nov 2024 12:14:57 +0200
+Subject: usb: xhci: Limit Stop Endpoint retries
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit 42b7581376015c1bbcbe5831f043cd0ac119d028 upstream.
+
+Some host controllers fail to atomically transition an endpoint to the
+Running state on a doorbell ring and enter a hidden "Restarting" state,
+which looks very much like Stopped, with the important difference that
+it will spontaneously transition to Running anytime soon.
+
+A Stop Endpoint command queued in the Restarting state typically fails
+with Context State Error and the completion handler sees the Endpoint
+Context State as either still Stopped or already Running. Even a case
+of Halted was observed, when an error occurred right after the restart.
+
+The Halted state is already recovered from by resetting the endpoint.
+The Running state is handled by retrying Stop Endpoint.
+
+The Stopped state was recognized as a problem on NEC controllers and
+worked around also by retrying, because the endpoint soon restarts and
+then stops for good. But there is a risk: the command may fail if the
+endpoint is "stopped for good" already, and retries will fail forever.
+
+The possibility of this was not realized at the time, but a number of
+cases were discovered later and reproduced. Some proved difficult to
+deal with, and it is outright impossible to predict if an endpoint may
+fail to ever start at all due to a hardware bug. One such bug (albeit
+on ASM3142, not on NEC) was found to be reliably triggered simply by
+toggling an AX88179 NIC up/down in a tight loop for a few seconds.
+
+An endless retries storm is quite nasty. Besides putting needless load
+on the xHC and CPU, it causes URBs never to be given back, paralyzing
+the device and connection/disconnection logic for the whole bus if the
+device is unplugged. User processes waiting for URBs become unkillable,
+drivers and kworker threads lock up and xhci_hcd cannot be reloaded.
+
+For peace of mind, impose a timeout on Stop Endpoint retries in this
+case. If they don't succeed in 100ms, consider the endpoint stopped
+permanently for some reason and just give back the unlinked URBs. This
+failure case is rare already and work is under way to make it rarer.
+
+Start this work today by also handling one simple case of race with
+Reset Endpoint, because it costs just two lines to implement.
+
+Fixes: fd9d55d190c0 ("xhci: retry Stop Endpoint on buggy NEC controllers")
+CC: stable@vger.kernel.org
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20241106101459.775897-32-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-ring.c |   28 ++++++++++++++++++++++++----
+ drivers/usb/host/xhci.c      |    2 ++
+ drivers/usb/host/xhci.h      |    1 +
+ 3 files changed, 27 insertions(+), 4 deletions(-)
+
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -52,6 +52,7 @@
+  *   endpoint rings; it generates events on the event ring for these.
+  */
+ 
++#include <linux/jiffies.h>
+ #include <linux/scatterlist.h>
+ #include <linux/slab.h>
+ #include <linux/dma-mapping.h>
+@@ -1151,16 +1152,35 @@ static void xhci_handle_cmd_stop_ep(stru
+ 			return;
+ 		case EP_STATE_STOPPED:
+ 			/*
+-			 * NEC uPD720200 sometimes sets this state and fails with
+-			 * Context Error while continuing to process TRBs.
+-			 * Be conservative and trust EP_CTX_STATE on other chips.
++			 * Per xHCI 4.6.9, Stop Endpoint command on a Stopped
++			 * EP is a Context State Error, and EP stays Stopped.
++			 *
++			 * But maybe it failed on Halted, and somebody ran Reset
++			 * Endpoint later. EP state is now Stopped and EP_HALTED
++			 * still set because Reset EP handler will run after us.
++			 */
++			if (ep->ep_state & EP_HALTED)
++				break;
++			/*
++			 * On some HCs EP state remains Stopped for some tens of
++			 * us to a few ms or more after a doorbell ring, and any
++			 * new Stop Endpoint fails without aborting the restart.
++			 * This handler may run quickly enough to still see this
++			 * Stopped state, but it will soon change to Running.
++			 *
++			 * Assume this bug on unexpected Stop Endpoint failures.
++			 * Keep retrying until the EP starts and stops again, on
++			 * chips where this is known to help. Wait for 100ms.
+ 			 */
+ 			if (!(xhci->quirks & XHCI_NEC_HOST))
+ 				break;
++			if (time_is_before_jiffies(ep->stop_time + msecs_to_jiffies(100)))
++				break;
+ 			fallthrough;
+ 		case EP_STATE_RUNNING:
+ 			/* Race, HW handled stop ep cmd before ep was running */
+-			xhci_dbg(xhci, "Stop ep completion ctx error, ep is running\n");
++			xhci_dbg(xhci, "Stop ep completion ctx error, ctx_state %d\n",
++					GET_EP_CTX_STATE(ep_ctx));
+ 
+ 			command = xhci_alloc_command(xhci, false, GFP_ATOMIC);
+ 			if (!command) {
+--- a/drivers/usb/host/xhci.c
++++ b/drivers/usb/host/xhci.c
+@@ -8,6 +8,7 @@
+  * Some code borrowed from the Linux EHCI driver.
+  */
+ 
++#include <linux/jiffies.h>
+ #include <linux/pci.h>
+ #include <linux/iommu.h>
+ #include <linux/iopoll.h>
+@@ -1777,6 +1778,7 @@ static int xhci_urb_dequeue(struct usb_h
+ 			ret = -ENOMEM;
+ 			goto done;
+ 		}
++		ep->stop_time = jiffies;
+ 		ep->ep_state |= EP_STOP_CMD_PENDING;
+ 		xhci_queue_stop_endpoint(xhci, command, urb->dev->slot_id,
+ 					 ep_index, 0);
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -690,6 +690,7 @@ struct xhci_virt_ep {
+ 	/* Bandwidth checking storage */
+ 	struct xhci_bw_info	bw_info;
+ 	struct list_head	bw_endpoint_list;
++	unsigned long		stop_time;
+ 	/* Isoch Frame ID checking storage */
+ 	int			next_frame_id;
+ 	/* Use new Isoch TRB layout needed for extended TBC support */