6.11-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 3 Dec 2024 08:27:57 +0000 (09:27 +0100)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 3 Dec 2024 08:27:57 +0000 (09:27 +0100)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 3 Dec 2024 08:27:57 +0000 (09:27 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 3 Dec 2024 08:27:57 +0000 (09:27 +0100)
diff --git a/queue-6.11/series b/queue-6.11/series

index 8748cd64ebec02c1b9af72910f2a05eeb378f333..2bace1ecfe6c6957673054c88955ce5cabed8209 100644 (file)
--- a/queue-6.11/series
+++ b/queue-6.11/series
@@ -677,3 +677,6 @@ xhci-combine-two-if-statements-for-etron-xhci-host.patch
  xhci-don-t-perform-soft-retry-for-etron-xhci-host.patch
  xhci-don-t-issue-reset-device-command-to-etron-xhci-host.patch
  bluetooth-fix-type-of-len-in-rfcomm_sock_getsockopt-_old.patch
+usb-xhci-limit-stop-endpoint-retries.patch
+usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch
+usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch
diff --git a/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch b/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch

new file mode 100644 (file)

index 0000000..fea1fba
--- /dev/null
+++ b/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch
@@ -0,0 +1,111 @@
+From 474538b8dd1cd9c666e56cfe8ef60fbb0fb513f4 Mon Sep 17 00:00:00 2001
+From: Michal Pecio <michal.pecio@gmail.com>
+Date: Wed, 6 Nov 2024 12:14:59 +0200
+Subject: usb: xhci: Avoid queuing redundant Stop Endpoint commands
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit 474538b8dd1cd9c666e56cfe8ef60fbb0fb513f4 upstream.
+
+Stop Endpoint command on an already stopped endpoint fails and may be
+misinterpreted as a known hardware bug by the completion handler. This
+results in an unnecessary delay with repeated retries of the command.
+
+Avoid queuing this command when endpoint state flags indicate that it's
+stopped or halted and the command will fail. If commands are pending on
+the endpoint, their completion handlers will process cancelled TDs so
+it's done. In case of waiting for external operations like clearing TT
+buffer, the endpoint is stopped and cancelled TDs can be processed now.
+
+This eliminates practically all unnecessary retries because an endpoint
+with pending URBs is maintained in Running state by the driver, unless
+aforementioned commands or other operations are pending on it. This is
+guaranteed by xhci_ring_ep_doorbell() and by the fact that it is called
+every time any of those operations completes.
+
+The only known exceptions are hardware bugs (the endpoint never starts
+at all) and Stream Protocol errors not associated with any TRB, which
+cause an endpoint reset not followed by restart. Sounds like a bug.
+
+Generally, these retries are only expected to happen when the endpoint
+fails to start for unknown/no reason, which is a worse problem itself,
+and fixing the bug eliminates the retries too.
+
+All cases were tested and found to work as expected. SET_DEQ_PENDING
+was produced by patching uvcvideo to unlink URBs in 100us intervals,
+which then runs into this case very often. EP_HALTED was produced by
+restarting 'cat /dev/ttyUSB0' on a serial dongle with broken cable.
+EP_CLEARING_TT by the same, with the dongle on an external hub.
+
+Fixes: fd9d55d190c0 ("xhci: retry Stop Endpoint on buggy NEC controllers")
+CC: stable@vger.kernel.org
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20241106101459.775897-34-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-ring.c |   13 +++++++++++++
+ drivers/usb/host/xhci.c      |   19 +++++++++++++++----
+ drivers/usb/host/xhci.h      |    1 +
+ 3 files changed, 29 insertions(+), 4 deletions(-)
+
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -1070,6 +1070,19 @@ static int xhci_invalidate_cancelled_tds
+ }
+ 
+ /*
++ * Erase queued TDs from transfer ring(s) and give back those the xHC didn't
++ * stop on. If necessary, queue commands to move the xHC off cancelled TDs it
++ * stopped on. Those will be given back later when the commands complete.
++ *
++ * Call under xhci->lock on a stopped endpoint.
++ */
++void xhci_process_cancelled_tds(struct xhci_virt_ep *ep)
++{
++      xhci_invalidate_cancelled_tds(ep);
++      xhci_giveback_invalidated_tds(ep);
++}
++
++/*
+  * Returns the TD the endpoint ring halted on.
+  * Only call for non-running rings without streams.
+  */
+--- a/drivers/usb/host/xhci.c
++++ b/drivers/usb/host/xhci.c
+@@ -1769,10 +1769,21 @@ static int xhci_urb_dequeue(struct usb_h
+               }
+       }
+ 
+-      /* Queue a stop endpoint command, but only if this is
+-       * the first cancellation to be handled.
+-       */
+-      if (!(ep->ep_state & EP_STOP_CMD_PENDING)) {
++      /* These completion handlers will sort out cancelled TDs for us */
++      if (ep->ep_state & (EP_STOP_CMD_PENDING | EP_HALTED | SET_DEQ_PENDING)) {
++              xhci_dbg(xhci, "Not queuing Stop Endpoint on slot %d ep %d in state 0x%x\n",
++                              urb->dev->slot_id, ep_index, ep->ep_state);
++              goto done;
++      }
++
++      /* In this case no commands are pending but the endpoint is stopped */
++      if (ep->ep_state & EP_CLEARING_TT) {
++              /* and cancelled TDs can be given back right away */
++              xhci_dbg(xhci, "Invalidating TDs instantly on slot %d ep %d in state 0x%x\n",
++                              urb->dev->slot_id, ep_index, ep->ep_state);
++              xhci_process_cancelled_tds(ep);
++      } else {
++              /* Otherwise, queue a new Stop Endpoint command */
+               command = xhci_alloc_command(xhci, false, GFP_ATOMIC);
+               if (!command) {
+                       ret = -ENOMEM;
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -1921,6 +1921,7 @@ void xhci_ring_doorbell_for_active_rings
+ void xhci_cleanup_command_queue(struct xhci_hcd *xhci);
+ void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring);
+ unsigned int count_trbs(u64 addr, u64 len);
++void xhci_process_cancelled_tds(struct xhci_virt_ep *ep);
+ 
+ /* xHCI roothub code */
+ void xhci_set_link_state(struct xhci_hcd *xhci, struct xhci_port *port,
diff --git a/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch b/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch

new file mode 100644 (file)

index 0000000..7a01dd9
--- /dev/null
+++ b/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch
@@ -0,0 +1,102 @@
+From 484c3bab2d5dfa13ff659a51a06e9a393141eefc Mon Sep 17 00:00:00 2001
+From: Michal Pecio <michal.pecio@gmail.com>
+Date: Wed, 6 Nov 2024 12:14:58 +0200
+Subject: usb: xhci: Fix TD invalidation under pending Set TR Dequeue
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit 484c3bab2d5dfa13ff659a51a06e9a393141eefc upstream.
+
+xhci_invalidate_cancelled_tds() may not work correctly if the hardware
+is modifying endpoint or stream contexts at the same time by executing
+a Set TR Dequeue command. And even if it worked, it would be unable to
+queue Set TR Dequeue for the next stream, failing to clear xHC cache.
+
+On stream endpoints, a chain of Set TR Dequeue commands may take some
+time to execute and we may want to cancel more TDs during this time.
+Currently this leads to Stop Endpoint completion handler calling this
+function without testing for SET_DEQ_PENDING, which will trigger the
+aforementioned problems when it happens.
+
+On all endpoints, a halt condition causes Reset Endpoint to be queued
+and an error status given to the class driver, which may unlink more
+URBs in response. Stop Endpoint is queued and its handler may execute
+concurrently with Set TR Dequeue queued by Reset Endpoint handler.
+
+(Reset Endpoint handler calls this function too, but there seems to
+be no possibility of it running concurrently with Set TR Dequeue).
+
+Fix xhci_invalidate_cancelled_tds() to work correctly under a pending
+Set TR Dequeue. Bail out of the function when SET_DEQ_PENDING is set,
+then make the completion handler call the function again and also call
+xhci_giveback_invalidated_tds(), which needs to be called next.
+
+This seems to fix another potential bug, where the handler would call
+xhci_invalidate_cancelled_tds(), which may clear some deferred TDs if
+a sanity check fails, and the TDs wouldn't be given back promptly.
+
+Said sanity check seems to be wrong and prone to false positives when
+the endpoint halts, but fixing it is beyond the scope of this change,
+besides ensuring that cleared TDs are given back properly.
+
+Fixes: 5ceac4402f5d ("xhci: Handle TD clearing for multiple streams case")
+CC: stable@vger.kernel.org
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20241106101459.775897-33-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-ring.c |   18 +++++++++++++-----
+ 1 file changed, 13 insertions(+), 5 deletions(-)
+
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -973,6 +973,13 @@ static int xhci_invalidate_cancelled_tds
+       unsigned int            slot_id = ep->vdev->slot_id;
+       int                     err;
+ 
++      /*
++       * This is not going to work if the hardware is changing its dequeue
++       * pointers as we look at them. Completion handler will call us later.
++       */
++      if (ep->ep_state & SET_DEQ_PENDING)
++              return 0;
++
+       xhci = ep->xhci;
+ 
+       list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list, cancelled_td_list) {
+@@ -1359,7 +1366,6 @@ static void xhci_handle_cmd_set_deq(stru
+       struct xhci_ep_ctx *ep_ctx;
+       struct xhci_slot_ctx *slot_ctx;
+       struct xhci_td *td, *tmp_td;
+-      bool deferred = false;
+ 
+       ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3]));
+       stream_id = TRB_TO_STREAM_ID(le32_to_cpu(trb->generic.field[2]));
+@@ -1460,8 +1466,6 @@ static void xhci_handle_cmd_set_deq(stru
+                       xhci_dbg(ep->xhci, "%s: Giveback cancelled URB %p TD\n",
+                                __func__, td->urb);
+                       xhci_td_cleanup(ep->xhci, td, ep_ring, td->status);
+-              } else if (td->cancel_status == TD_CLEARING_CACHE_DEFERRED) {
+-                      deferred = true;
+               } else {
+                       xhci_dbg(ep->xhci, "%s: Keep cancelled URB %p TD as cancel_status is %d\n",
+                                __func__, td->urb, td->cancel_status);
+@@ -1472,11 +1476,15 @@ cleanup:
+       ep->queued_deq_seg = NULL;
+       ep->queued_deq_ptr = NULL;
+ 
+-      if (deferred) {
+-              /* We have more streams to clear */
++      /* Check for deferred or newly cancelled TDs */
++      if (!list_empty(&ep->cancelled_td_list)) {
+               xhci_dbg(ep->xhci, "%s: Pending TDs to clear, continuing with invalidation\n",
+                        __func__);
+               xhci_invalidate_cancelled_tds(ep);
++              /* Try to restart the endpoint if all is done */
++              ring_doorbell_for_active_rings(xhci, slot_id, ep_index);
++              /* Start giving back any TDs invalidated above */
++              xhci_giveback_invalidated_tds(ep);
+       } else {
+               /* Restart any rings with pending URBs */
+               xhci_dbg(ep->xhci, "%s: All TDs cleared, ring doorbell\n", __func__);
diff --git a/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch b/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch

new file mode 100644 (file)

index 0000000..a18a21f
--- /dev/null
+++ b/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch
@@ -0,0 +1,138 @@
+From 42b7581376015c1bbcbe5831f043cd0ac119d028 Mon Sep 17 00:00:00 2001
+From: Michal Pecio <michal.pecio@gmail.com>
+Date: Wed, 6 Nov 2024 12:14:57 +0200
+Subject: usb: xhci: Limit Stop Endpoint retries
+
+From: Michal Pecio <michal.pecio@gmail.com>
+
+commit 42b7581376015c1bbcbe5831f043cd0ac119d028 upstream.
+
+Some host controllers fail to atomically transition an endpoint to the
+Running state on a doorbell ring and enter a hidden "Restarting" state,
+which looks very much like Stopped, with the important difference that
+it will spontaneously transition to Running anytime soon.
+
+A Stop Endpoint command queued in the Restarting state typically fails
+with Context State Error and the completion handler sees the Endpoint
+Context State as either still Stopped or already Running. Even a case
+of Halted was observed, when an error occurred right after the restart.
+
+The Halted state is already recovered from by resetting the endpoint.
+The Running state is handled by retrying Stop Endpoint.
+
+The Stopped state was recognized as a problem on NEC controllers and
+worked around also by retrying, because the endpoint soon restarts and
+then stops for good. But there is a risk: the command may fail if the
+endpoint is "stopped for good" already, and retries will fail forever.
+
+The possibility of this was not realized at the time, but a number of
+cases were discovered later and reproduced. Some proved difficult to
+deal with, and it is outright impossible to predict if an endpoint may
+fail to ever start at all due to a hardware bug. One such bug (albeit
+on ASM3142, not on NEC) was found to be reliably triggered simply by
+toggling an AX88179 NIC up/down in a tight loop for a few seconds.
+
+An endless retries storm is quite nasty. Besides putting needless load
+on the xHC and CPU, it causes URBs never to be given back, paralyzing
+the device and connection/disconnection logic for the whole bus if the
+device is unplugged. User processes waiting for URBs become unkillable,
+drivers and kworker threads lock up and xhci_hcd cannot be reloaded.
+
+For peace of mind, impose a timeout on Stop Endpoint retries in this
+case. If they don't succeed in 100ms, consider the endpoint stopped
+permanently for some reason and just give back the unlinked URBs. This
+failure case is rare already and work is under way to make it rarer.
+
+Start this work today by also handling one simple case of race with
+Reset Endpoint, because it costs just two lines to implement.
+
+Fixes: fd9d55d190c0 ("xhci: retry Stop Endpoint on buggy NEC controllers")
+CC: stable@vger.kernel.org
+Signed-off-by: Michal Pecio <michal.pecio@gmail.com>
+Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
+Link: https://lore.kernel.org/r/20241106101459.775897-32-mathias.nyman@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/host/xhci-ring.c |   28 ++++++++++++++++++++++++----
+ drivers/usb/host/xhci.c      |    2 ++
+ drivers/usb/host/xhci.h      |    1 +
+ 3 files changed, 27 insertions(+), 4 deletions(-)
+
+--- a/drivers/usb/host/xhci-ring.c
++++ b/drivers/usb/host/xhci-ring.c
+@@ -52,6 +52,7 @@
+  *   endpoint rings; it generates events on the event ring for these.
+  */
+ 
++#include <linux/jiffies.h>
+ #include <linux/scatterlist.h>
+ #include <linux/slab.h>
+ #include <linux/dma-mapping.h>
+@@ -1151,16 +1152,35 @@ static void xhci_handle_cmd_stop_ep(stru
+                       return;
+               case EP_STATE_STOPPED:
+                       /*
+-                       * NEC uPD720200 sometimes sets this state and fails with
+-                       * Context Error while continuing to process TRBs.
+-                       * Be conservative and trust EP_CTX_STATE on other chips.
++                       * Per xHCI 4.6.9, Stop Endpoint command on a Stopped
++                       * EP is a Context State Error, and EP stays Stopped.
++                       *
++                       * But maybe it failed on Halted, and somebody ran Reset
++                       * Endpoint later. EP state is now Stopped and EP_HALTED
++                       * still set because Reset EP handler will run after us.
++                       */
++                      if (ep->ep_state & EP_HALTED)
++                              break;
++                      /*
++                       * On some HCs EP state remains Stopped for some tens of
++                       * us to a few ms or more after a doorbell ring, and any
++                       * new Stop Endpoint fails without aborting the restart.
++                       * This handler may run quickly enough to still see this
++                       * Stopped state, but it will soon change to Running.
++                       *
++                       * Assume this bug on unexpected Stop Endpoint failures.
++                       * Keep retrying until the EP starts and stops again, on
++                       * chips where this is known to help. Wait for 100ms.
+                        */
+                       if (!(xhci->quirks & XHCI_NEC_HOST))
+                               break;
++                      if (time_is_before_jiffies(ep->stop_time + msecs_to_jiffies(100)))
++                              break;
+                       fallthrough;
+               case EP_STATE_RUNNING:
+                       /* Race, HW handled stop ep cmd before ep was running */
+-                      xhci_dbg(xhci, "Stop ep completion ctx error, ep is running\n");
++                      xhci_dbg(xhci, "Stop ep completion ctx error, ctx_state %d\n",
++                                      GET_EP_CTX_STATE(ep_ctx));
+ 
+                       command = xhci_alloc_command(xhci, false, GFP_ATOMIC);
+                       if (!command) {
+--- a/drivers/usb/host/xhci.c
++++ b/drivers/usb/host/xhci.c
+@@ -8,6 +8,7 @@
+  * Some code borrowed from the Linux EHCI driver.
+  */
+ 
++#include <linux/jiffies.h>
+ #include <linux/pci.h>
+ #include <linux/iommu.h>
+ #include <linux/iopoll.h>
+@@ -1777,6 +1778,7 @@ static int xhci_urb_dequeue(struct usb_h
+                       ret = -ENOMEM;
+                       goto done;
+               }
++              ep->stop_time = jiffies;
+               ep->ep_state |= EP_STOP_CMD_PENDING;
+               xhci_queue_stop_endpoint(xhci, command, urb->dev->slot_id,
+                                        ep_index, 0);
+--- a/drivers/usb/host/xhci.h
++++ b/drivers/usb/host/xhci.h
+@@ -690,6 +690,7 @@ struct xhci_virt_ep {
+       /* Bandwidth checking storage */
+       struct xhci_bw_info     bw_info;
+       struct list_head        bw_endpoint_list;
++      unsigned long           stop_time;
+       /* Isoch Frame ID checking storage */
+       int                     next_frame_id;
+       /* Use new Isoch TRB layout needed for extended TBC support */
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 3 Dec 2024 08:27:57 +0000 (09:27 +0100)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 3 Dec 2024 08:27:57 +0000 (09:27 +0100)
queue-6.11/series		patch \| blob \| blame \| history
queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch	[new file with mode: 0644]	patch \| blob
queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch	[new file with mode: 0644]	patch \| blob
queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch	[new file with mode: 0644]	patch \| blob