Fixes for 4.4

author Sasha Levin <sashal@kernel.org>

Fri, 5 Feb 2021 12:39:49 +0000 (07:39 -0500)

committer Sasha Levin <sashal@kernel.org>

Fri, 5 Feb 2021 12:39:49 +0000 (07:39 -0500)
author Sasha Levin <sashal@kernel.org>
Fri, 5 Feb 2021 12:39:49 +0000 (07:39 -0500)
committer Sasha Levin <sashal@kernel.org>
Fri, 5 Feb 2021 12:39:49 +0000 (07:39 -0500)
diff --git a/queue-4.4/scsi-ibmvfc-set-default-timeout-to-avoid-crash-durin.patch b/queue-4.4/scsi-ibmvfc-set-default-timeout-to-avoid-crash-durin.patch

new file mode 100644 (file)

index 0000000..59bf655
--- /dev/null
+++ b/queue-4.4/scsi-ibmvfc-set-default-timeout-to-avoid-crash-durin.patch
@@ -0,0 +1,85 @@
+From 188578d9c2aa171163c4098019e2914a4740cdc8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jan 2021 09:06:38 -0600
+Subject: scsi: ibmvfc: Set default timeout to avoid crash during migration
+
+From: Brian King <brking@linux.vnet.ibm.com>
+
+[ Upstream commit 764907293edc1af7ac857389af9dc858944f53dc ]
+
+While testing live partition mobility, we have observed occasional crashes
+of the Linux partition. What we've seen is that during the live migration,
+for specific configurations with large amounts of memory, slow network
+links, and workloads that are changing memory a lot, the partition can end
+up being suspended for 30 seconds or longer. This resulted in the following
+scenario:
+
+CPU 0                          CPU 1
+-------------------------------  ----------------------------------
+scsi_queue_rq                    migration_store
+ -> blk_mq_start_request          -> rtas_ibm_suspend_me
+  -> blk_add_timer                 -> on_each_cpu(rtas_percpu_suspend_me
+              _______________________________________V
+             |
+             V
+    -> IPI from CPU 1
+     -> rtas_percpu_suspend_me
+                                     -> __rtas_suspend_last_cpu
+
+-- Linux partition suspended for > 30 seconds --
+                                      -> for_each_online_cpu(cpu)
+                                           plpar_hcall_norets(H_PROD
+ -> scsi_dispatch_cmd
+                                      -> scsi_times_out
+                                       -> scsi_abort_command
+                                        -> queue_delayed_work
+  -> ibmvfc_queuecommand_lck
+   -> ibmvfc_send_event
+    -> ibmvfc_send_crq
+     - returns H_CLOSED
+   <- returns SCSI_MLQUEUE_HOST_BUSY
+-> __blk_mq_requeue_request
+
+                                      -> scmd_eh_abort_handler
+                                       -> scsi_try_to_abort_cmd
+                                         - returns SUCCESS
+                                       -> scsi_queue_insert
+
+Normally, the SCMD_STATE_COMPLETE bit would protect against a race between
+command completion and the timeout, but that doesn't work here, since we
+don't check that bit at all in the SCSI_MLQUEUE_HOST_BUSY path.
+
+In this case we end up calling scsi_queue_insert on a request that has
+already been queued, or possibly even freed, and we crash.
+
+The patch below simply increases the default I/O timeout for disk devices to
+120 seconds to avoid this race condition. This is also the timeout value that
+nearly all IBM SAN storage recommends setting as the default value.
+
+Link: https://lore.kernel.org/r/1610463998-19791-1-git-send-email-brking@linux.vnet.ibm.com
+Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/ibmvscsi/ibmvfc.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
+index db80ab8335dfb..aa74f72e582ab 100644
+--- a/drivers/scsi/ibmvscsi/ibmvfc.c
++++ b/drivers/scsi/ibmvscsi/ibmvfc.c
+@@ -2883,8 +2883,10 @@ static int ibmvfc_slave_configure(struct scsi_device *sdev)
+       unsigned long flags = 0;
+ 
+       spin_lock_irqsave(shost->host_lock, flags);
+-      if (sdev->type == TYPE_DISK)
++      if (sdev->type == TYPE_DISK) {
+               sdev->allow_restart = 1;
++              blk_queue_rq_timeout(sdev->request_queue, 120 * HZ);
++      }
+       spin_unlock_irqrestore(shost->host_lock, flags);
+       return 0;
+ }
+-- 
+2.27.0
+
diff --git a/queue-4.4/scsi-libfc-avoid-invoking-response-handler-twice-if-.patch b/queue-4.4/scsi-libfc-avoid-invoking-response-handler-twice-if-.patch

new file mode 100644 (file)

index 0000000..1b5b0ef
--- /dev/null
+++ b/queue-4.4/scsi-libfc-avoid-invoking-response-handler-twice-if-.patch
@@ -0,0 +1,95 @@
+From 789368da217cd5d217f576bafd79ab6e3c283680 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 15 Dec 2020 11:47:31 -0800
+Subject: scsi: libfc: Avoid invoking response handler twice if ep is already
+ completed
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Javed Hasan <jhasan@marvell.com>
+
+[ Upstream commit b2b0f16fa65e910a3ec8771206bb49ee87a54ac5 ]
+
+A race condition exists between the response handler getting called because
+of exchange_mgr_reset() (which clears out all the active XIDs) and the
+response we get via an interrupt.
+
+Sequence of events:
+
+        rport ba0200: Port timeout, state PLOGI
+        rport ba0200: Port entered PLOGI state from PLOGI state
+        xid 1052: Exchange timer armed : 20000 msecs      xid timer armed here
+        rport ba0200: Received LOGO request while in state PLOGI
+        rport ba0200: Delete port
+        rport ba0200: work event 3
+        rport ba0200: lld callback ev 3
+        bnx2fc: rport_event_hdlr: event = 3, port_id = 0xba0200
+        bnx2fc: ba0200 - rport not created Yet!!
+        /* Here we use exch_mgr_reset() to reset any outstanding
+        exchanges before freeing the rport */
+        xid 1052: Exchange timer canceled
+        /* Here we got two responses for one xid */
+        xid 1052: invoking resp(), esb 20000000 state 3
+        xid 1052: invoking resp(), esb 20000000 state 3
+        xid 1052: fc_rport_plogi_resp() : ep->resp_active 2
+        xid 1052: fc_rport_plogi_resp() : ep->resp_active 2
+
+Skip the response if the exchange is already completed.
+
+Link: https://lore.kernel.org/r/20201215194731.2326-1-jhasan@marvell.com
+Signed-off-by: Javed Hasan <jhasan@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/libfc/fc_exch.c | 16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
+index b20c575564e43..a088f74a157c7 100644
+--- a/drivers/scsi/libfc/fc_exch.c
++++ b/drivers/scsi/libfc/fc_exch.c
+@@ -1577,8 +1577,13 @@ static void fc_exch_recv_seq_resp(struct fc_exch_mgr *mp, struct fc_frame *fp)
+               rc = fc_exch_done_locked(ep);
+               WARN_ON(fc_seq_exch(sp) != ep);
+               spin_unlock_bh(&ep->ex_lock);
+-              if (!rc)
++              if (!rc) {
+                       fc_exch_delete(ep);
++              } else {
++                      FC_EXCH_DBG(ep, "ep is completed already,"
++                                      "hence skip calling the resp\n");
++                      goto skip_resp;
++              }
+       }
+ 
+       /*
+@@ -1597,6 +1602,7 @@ static void fc_exch_recv_seq_resp(struct fc_exch_mgr *mp, struct fc_frame *fp)
+       if (!fc_invoke_resp(ep, sp, fp))
+               fc_frame_free(fp);
+ 
++skip_resp:
+       fc_exch_release(ep);
+       return;
+ rel:
+@@ -1841,10 +1847,16 @@ static void fc_exch_reset(struct fc_exch *ep)
+ 
+       fc_exch_hold(ep);
+ 
+-      if (!rc)
++      if (!rc) {
+               fc_exch_delete(ep);
++      } else {
++              FC_EXCH_DBG(ep, "ep is completed already,"
++                              "hence skip calling the resp\n");
++              goto skip_resp;
++      }
+ 
+       fc_invoke_resp(ep, sp, ERR_PTR(-FC_EX_CLOSED));
++skip_resp:
+       fc_seq_set_resp(sp, NULL, ep->arg);
+       fc_exch_release(ep);
+ }
+-- 
+2.27.0
+
diff --git a/queue-4.4/series b/queue-4.4/series

index 8086dab0b0c1e276cb83d8f31cd2fcb44b677f4d..eda79455b5c03bdac706506dea51ccef733325b2 100644 (file)
--- a/queue-4.4/series
+++ b/queue-4.4/series
@@ -10,3 +10,5 @@ futex-use-pi_state_update_owner-in-put_pi_state.patch
  futex-simplify-fixup_pi_state_owner.patch
  futex-handle-faults-correctly-for-pi-futexes.patch
  usb-udc-core-use-lock-when-write-to-soft_connect.patch
+scsi-libfc-avoid-invoking-response-handler-twice-if-.patch
+scsi-ibmvfc-set-default-timeout-to-avoid-crash-durin.patch
author	Sasha Levin <sashal@kernel.org>
	Fri, 5 Feb 2021 12:39:49 +0000 (07:39 -0500)
committer	Sasha Levin <sashal@kernel.org>
	Fri, 5 Feb 2021 12:39:49 +0000 (07:39 -0500)
queue-4.4/scsi-ibmvfc-set-default-timeout-to-avoid-crash-durin.patch	[new file with mode: 0644]	patch \| blob
queue-4.4/scsi-libfc-avoid-invoking-response-handler-twice-if-.patch	[new file with mode: 0644]	patch \| blob
queue-4.4/series		patch \| blob \| blame \| history