From: Greg Kroah-Hartman Date: Tue, 3 Dec 2024 08:27:57 +0000 (+0100) Subject: 6.11-stable patches X-Git-Tag: v4.19.325~65 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=46cb70f868b704f895a5e4e2f1a2c382a5f32ade;p=thirdparty%2Fkernel%2Fstable-queue.git 6.11-stable patches added patches: usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch usb-xhci-limit-stop-endpoint-retries.patch --- diff --git a/queue-6.11/series b/queue-6.11/series index 8748cd64ebe..2bace1ecfe6 100644 --- a/queue-6.11/series +++ b/queue-6.11/series @@ -677,3 +677,6 @@ xhci-combine-two-if-statements-for-etron-xhci-host.patch xhci-don-t-perform-soft-retry-for-etron-xhci-host.patch xhci-don-t-issue-reset-device-command-to-etron-xhci-host.patch bluetooth-fix-type-of-len-in-rfcomm_sock_getsockopt-_old.patch +usb-xhci-limit-stop-endpoint-retries.patch +usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch +usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch diff --git a/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch b/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch new file mode 100644 index 00000000000..fea1fba3ed1 --- /dev/null +++ b/queue-6.11/usb-xhci-avoid-queuing-redundant-stop-endpoint-commands.patch @@ -0,0 +1,111 @@ +From 474538b8dd1cd9c666e56cfe8ef60fbb0fb513f4 Mon Sep 17 00:00:00 2001 +From: Michal Pecio +Date: Wed, 6 Nov 2024 12:14:59 +0200 +Subject: usb: xhci: Avoid queuing redundant Stop Endpoint commands + +From: Michal Pecio + +commit 474538b8dd1cd9c666e56cfe8ef60fbb0fb513f4 upstream. + +Stop Endpoint command on an already stopped endpoint fails and may be +misinterpreted as a known hardware bug by the completion handler. This +results in an unnecessary delay with repeated retries of the command. + +Avoid queuing this command when endpoint state flags indicate that it's +stopped or halted and the command will fail. If commands are pending on +the endpoint, their completion handlers will process cancelled TDs so +it's done. In case of waiting for external operations like clearing TT +buffer, the endpoint is stopped and cancelled TDs can be processed now. + +This eliminates practically all unnecessary retries because an endpoint +with pending URBs is maintained in Running state by the driver, unless +aforementioned commands or other operations are pending on it. This is +guaranteed by xhci_ring_ep_doorbell() and by the fact that it is called +every time any of those operations completes. + +The only known exceptions are hardware bugs (the endpoint never starts +at all) and Stream Protocol errors not associated with any TRB, which +cause an endpoint reset not followed by restart. Sounds like a bug. + +Generally, these retries are only expected to happen when the endpoint +fails to start for unknown/no reason, which is a worse problem itself, +and fixing the bug eliminates the retries too. + +All cases were tested and found to work as expected. SET_DEQ_PENDING +was produced by patching uvcvideo to unlink URBs in 100us intervals, +which then runs into this case very often. EP_HALTED was produced by +restarting 'cat /dev/ttyUSB0' on a serial dongle with broken cable. +EP_CLEARING_TT by the same, with the dongle on an external hub. + +Fixes: fd9d55d190c0 ("xhci: retry Stop Endpoint on buggy NEC controllers") +CC: stable@vger.kernel.org +Signed-off-by: Michal Pecio +Signed-off-by: Mathias Nyman +Link: https://lore.kernel.org/r/20241106101459.775897-34-mathias.nyman@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/host/xhci-ring.c | 13 +++++++++++++ + drivers/usb/host/xhci.c | 19 +++++++++++++++---- + drivers/usb/host/xhci.h | 1 + + 3 files changed, 29 insertions(+), 4 deletions(-) + +--- a/drivers/usb/host/xhci-ring.c ++++ b/drivers/usb/host/xhci-ring.c +@@ -1070,6 +1070,19 @@ static int xhci_invalidate_cancelled_tds + } + + /* ++ * Erase queued TDs from transfer ring(s) and give back those the xHC didn't ++ * stop on. If necessary, queue commands to move the xHC off cancelled TDs it ++ * stopped on. Those will be given back later when the commands complete. ++ * ++ * Call under xhci->lock on a stopped endpoint. ++ */ ++void xhci_process_cancelled_tds(struct xhci_virt_ep *ep) ++{ ++ xhci_invalidate_cancelled_tds(ep); ++ xhci_giveback_invalidated_tds(ep); ++} ++ ++/* + * Returns the TD the endpoint ring halted on. + * Only call for non-running rings without streams. + */ +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -1769,10 +1769,21 @@ static int xhci_urb_dequeue(struct usb_h + } + } + +- /* Queue a stop endpoint command, but only if this is +- * the first cancellation to be handled. +- */ +- if (!(ep->ep_state & EP_STOP_CMD_PENDING)) { ++ /* These completion handlers will sort out cancelled TDs for us */ ++ if (ep->ep_state & (EP_STOP_CMD_PENDING | EP_HALTED | SET_DEQ_PENDING)) { ++ xhci_dbg(xhci, "Not queuing Stop Endpoint on slot %d ep %d in state 0x%x\n", ++ urb->dev->slot_id, ep_index, ep->ep_state); ++ goto done; ++ } ++ ++ /* In this case no commands are pending but the endpoint is stopped */ ++ if (ep->ep_state & EP_CLEARING_TT) { ++ /* and cancelled TDs can be given back right away */ ++ xhci_dbg(xhci, "Invalidating TDs instantly on slot %d ep %d in state 0x%x\n", ++ urb->dev->slot_id, ep_index, ep->ep_state); ++ xhci_process_cancelled_tds(ep); ++ } else { ++ /* Otherwise, queue a new Stop Endpoint command */ + command = xhci_alloc_command(xhci, false, GFP_ATOMIC); + if (!command) { + ret = -ENOMEM; +--- a/drivers/usb/host/xhci.h ++++ b/drivers/usb/host/xhci.h +@@ -1921,6 +1921,7 @@ void xhci_ring_doorbell_for_active_rings + void xhci_cleanup_command_queue(struct xhci_hcd *xhci); + void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring); + unsigned int count_trbs(u64 addr, u64 len); ++void xhci_process_cancelled_tds(struct xhci_virt_ep *ep); + + /* xHCI roothub code */ + void xhci_set_link_state(struct xhci_hcd *xhci, struct xhci_port *port, diff --git a/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch b/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch new file mode 100644 index 00000000000..7a01dd9ff7e --- /dev/null +++ b/queue-6.11/usb-xhci-fix-td-invalidation-under-pending-set-tr-dequeue.patch @@ -0,0 +1,102 @@ +From 484c3bab2d5dfa13ff659a51a06e9a393141eefc Mon Sep 17 00:00:00 2001 +From: Michal Pecio +Date: Wed, 6 Nov 2024 12:14:58 +0200 +Subject: usb: xhci: Fix TD invalidation under pending Set TR Dequeue + +From: Michal Pecio + +commit 484c3bab2d5dfa13ff659a51a06e9a393141eefc upstream. + +xhci_invalidate_cancelled_tds() may not work correctly if the hardware +is modifying endpoint or stream contexts at the same time by executing +a Set TR Dequeue command. And even if it worked, it would be unable to +queue Set TR Dequeue for the next stream, failing to clear xHC cache. + +On stream endpoints, a chain of Set TR Dequeue commands may take some +time to execute and we may want to cancel more TDs during this time. +Currently this leads to Stop Endpoint completion handler calling this +function without testing for SET_DEQ_PENDING, which will trigger the +aforementioned problems when it happens. + +On all endpoints, a halt condition causes Reset Endpoint to be queued +and an error status given to the class driver, which may unlink more +URBs in response. Stop Endpoint is queued and its handler may execute +concurrently with Set TR Dequeue queued by Reset Endpoint handler. + +(Reset Endpoint handler calls this function too, but there seems to +be no possibility of it running concurrently with Set TR Dequeue). + +Fix xhci_invalidate_cancelled_tds() to work correctly under a pending +Set TR Dequeue. Bail out of the function when SET_DEQ_PENDING is set, +then make the completion handler call the function again and also call +xhci_giveback_invalidated_tds(), which needs to be called next. + +This seems to fix another potential bug, where the handler would call +xhci_invalidate_cancelled_tds(), which may clear some deferred TDs if +a sanity check fails, and the TDs wouldn't be given back promptly. + +Said sanity check seems to be wrong and prone to false positives when +the endpoint halts, but fixing it is beyond the scope of this change, +besides ensuring that cleared TDs are given back properly. + +Fixes: 5ceac4402f5d ("xhci: Handle TD clearing for multiple streams case") +CC: stable@vger.kernel.org +Signed-off-by: Michal Pecio +Signed-off-by: Mathias Nyman +Link: https://lore.kernel.org/r/20241106101459.775897-33-mathias.nyman@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/host/xhci-ring.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +--- a/drivers/usb/host/xhci-ring.c ++++ b/drivers/usb/host/xhci-ring.c +@@ -973,6 +973,13 @@ static int xhci_invalidate_cancelled_tds + unsigned int slot_id = ep->vdev->slot_id; + int err; + ++ /* ++ * This is not going to work if the hardware is changing its dequeue ++ * pointers as we look at them. Completion handler will call us later. ++ */ ++ if (ep->ep_state & SET_DEQ_PENDING) ++ return 0; ++ + xhci = ep->xhci; + + list_for_each_entry_safe(td, tmp_td, &ep->cancelled_td_list, cancelled_td_list) { +@@ -1359,7 +1366,6 @@ static void xhci_handle_cmd_set_deq(stru + struct xhci_ep_ctx *ep_ctx; + struct xhci_slot_ctx *slot_ctx; + struct xhci_td *td, *tmp_td; +- bool deferred = false; + + ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3])); + stream_id = TRB_TO_STREAM_ID(le32_to_cpu(trb->generic.field[2])); +@@ -1460,8 +1466,6 @@ static void xhci_handle_cmd_set_deq(stru + xhci_dbg(ep->xhci, "%s: Giveback cancelled URB %p TD\n", + __func__, td->urb); + xhci_td_cleanup(ep->xhci, td, ep_ring, td->status); +- } else if (td->cancel_status == TD_CLEARING_CACHE_DEFERRED) { +- deferred = true; + } else { + xhci_dbg(ep->xhci, "%s: Keep cancelled URB %p TD as cancel_status is %d\n", + __func__, td->urb, td->cancel_status); +@@ -1472,11 +1476,15 @@ cleanup: + ep->queued_deq_seg = NULL; + ep->queued_deq_ptr = NULL; + +- if (deferred) { +- /* We have more streams to clear */ ++ /* Check for deferred or newly cancelled TDs */ ++ if (!list_empty(&ep->cancelled_td_list)) { + xhci_dbg(ep->xhci, "%s: Pending TDs to clear, continuing with invalidation\n", + __func__); + xhci_invalidate_cancelled_tds(ep); ++ /* Try to restart the endpoint if all is done */ ++ ring_doorbell_for_active_rings(xhci, slot_id, ep_index); ++ /* Start giving back any TDs invalidated above */ ++ xhci_giveback_invalidated_tds(ep); + } else { + /* Restart any rings with pending URBs */ + xhci_dbg(ep->xhci, "%s: All TDs cleared, ring doorbell\n", __func__); diff --git a/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch b/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch new file mode 100644 index 00000000000..a18a21f6911 --- /dev/null +++ b/queue-6.11/usb-xhci-limit-stop-endpoint-retries.patch @@ -0,0 +1,138 @@ +From 42b7581376015c1bbcbe5831f043cd0ac119d028 Mon Sep 17 00:00:00 2001 +From: Michal Pecio +Date: Wed, 6 Nov 2024 12:14:57 +0200 +Subject: usb: xhci: Limit Stop Endpoint retries + +From: Michal Pecio + +commit 42b7581376015c1bbcbe5831f043cd0ac119d028 upstream. + +Some host controllers fail to atomically transition an endpoint to the +Running state on a doorbell ring and enter a hidden "Restarting" state, +which looks very much like Stopped, with the important difference that +it will spontaneously transition to Running anytime soon. + +A Stop Endpoint command queued in the Restarting state typically fails +with Context State Error and the completion handler sees the Endpoint +Context State as either still Stopped or already Running. Even a case +of Halted was observed, when an error occurred right after the restart. + +The Halted state is already recovered from by resetting the endpoint. +The Running state is handled by retrying Stop Endpoint. + +The Stopped state was recognized as a problem on NEC controllers and +worked around also by retrying, because the endpoint soon restarts and +then stops for good. But there is a risk: the command may fail if the +endpoint is "stopped for good" already, and retries will fail forever. + +The possibility of this was not realized at the time, but a number of +cases were discovered later and reproduced. Some proved difficult to +deal with, and it is outright impossible to predict if an endpoint may +fail to ever start at all due to a hardware bug. One such bug (albeit +on ASM3142, not on NEC) was found to be reliably triggered simply by +toggling an AX88179 NIC up/down in a tight loop for a few seconds. + +An endless retries storm is quite nasty. Besides putting needless load +on the xHC and CPU, it causes URBs never to be given back, paralyzing +the device and connection/disconnection logic for the whole bus if the +device is unplugged. User processes waiting for URBs become unkillable, +drivers and kworker threads lock up and xhci_hcd cannot be reloaded. + +For peace of mind, impose a timeout on Stop Endpoint retries in this +case. If they don't succeed in 100ms, consider the endpoint stopped +permanently for some reason and just give back the unlinked URBs. This +failure case is rare already and work is under way to make it rarer. + +Start this work today by also handling one simple case of race with +Reset Endpoint, because it costs just two lines to implement. + +Fixes: fd9d55d190c0 ("xhci: retry Stop Endpoint on buggy NEC controllers") +CC: stable@vger.kernel.org +Signed-off-by: Michal Pecio +Signed-off-by: Mathias Nyman +Link: https://lore.kernel.org/r/20241106101459.775897-32-mathias.nyman@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/host/xhci-ring.c | 28 ++++++++++++++++++++++++---- + drivers/usb/host/xhci.c | 2 ++ + drivers/usb/host/xhci.h | 1 + + 3 files changed, 27 insertions(+), 4 deletions(-) + +--- a/drivers/usb/host/xhci-ring.c ++++ b/drivers/usb/host/xhci-ring.c +@@ -52,6 +52,7 @@ + * endpoint rings; it generates events on the event ring for these. + */ + ++#include + #include + #include + #include +@@ -1151,16 +1152,35 @@ static void xhci_handle_cmd_stop_ep(stru + return; + case EP_STATE_STOPPED: + /* +- * NEC uPD720200 sometimes sets this state and fails with +- * Context Error while continuing to process TRBs. +- * Be conservative and trust EP_CTX_STATE on other chips. ++ * Per xHCI 4.6.9, Stop Endpoint command on a Stopped ++ * EP is a Context State Error, and EP stays Stopped. ++ * ++ * But maybe it failed on Halted, and somebody ran Reset ++ * Endpoint later. EP state is now Stopped and EP_HALTED ++ * still set because Reset EP handler will run after us. ++ */ ++ if (ep->ep_state & EP_HALTED) ++ break; ++ /* ++ * On some HCs EP state remains Stopped for some tens of ++ * us to a few ms or more after a doorbell ring, and any ++ * new Stop Endpoint fails without aborting the restart. ++ * This handler may run quickly enough to still see this ++ * Stopped state, but it will soon change to Running. ++ * ++ * Assume this bug on unexpected Stop Endpoint failures. ++ * Keep retrying until the EP starts and stops again, on ++ * chips where this is known to help. Wait for 100ms. + */ + if (!(xhci->quirks & XHCI_NEC_HOST)) + break; ++ if (time_is_before_jiffies(ep->stop_time + msecs_to_jiffies(100))) ++ break; + fallthrough; + case EP_STATE_RUNNING: + /* Race, HW handled stop ep cmd before ep was running */ +- xhci_dbg(xhci, "Stop ep completion ctx error, ep is running\n"); ++ xhci_dbg(xhci, "Stop ep completion ctx error, ctx_state %d\n", ++ GET_EP_CTX_STATE(ep_ctx)); + + command = xhci_alloc_command(xhci, false, GFP_ATOMIC); + if (!command) { +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -8,6 +8,7 @@ + * Some code borrowed from the Linux EHCI driver. + */ + ++#include + #include + #include + #include +@@ -1777,6 +1778,7 @@ static int xhci_urb_dequeue(struct usb_h + ret = -ENOMEM; + goto done; + } ++ ep->stop_time = jiffies; + ep->ep_state |= EP_STOP_CMD_PENDING; + xhci_queue_stop_endpoint(xhci, command, urb->dev->slot_id, + ep_index, 0); +--- a/drivers/usb/host/xhci.h ++++ b/drivers/usb/host/xhci.h +@@ -690,6 +690,7 @@ struct xhci_virt_ep { + /* Bandwidth checking storage */ + struct xhci_bw_info bw_info; + struct list_head bw_endpoint_list; ++ unsigned long stop_time; + /* Isoch Frame ID checking storage */ + int next_frame_id; + /* Use new Isoch TRB layout needed for extended TBC support */