From 18b5abc7d04dbbeb9fee90a8460c2cbb2fdac7e1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 22 Sep 2017 12:59:55 +0200 Subject: [PATCH] 3.18-stable patches added patches: scsi-zfcp-add-handling-for-fcp_resid_over-to-the-fcp-ingress-path.patch scsi-zfcp-fix-missing-trace-records-for-early-returns-in-tmf-eh-handlers.patch scsi-zfcp-fix-payload-with-full-fcp_rsp-iu-in-scsi-trace-records.patch scsi-zfcp-fix-queuecommand-for-scsi_eh-commands-when-dix-enabled.patch scsi-zfcp-trace-hba-fsf-response-by-default-on-dismiss-or-timedout-late-response.patch scsi-zfcp-trace-high-part-of-new-64-bit-scsi-lun.patch skd-avoid-that-module-unloading-triggers-a-use-after-free.patch skd-submit-requests-to-firmware-before-triggering-the-doorbell.patch --- ...p_resid_over-to-the-fcp-ingress-path.patch | 90 ++++++++ ...for-early-returns-in-tmf-eh-handlers.patch | 62 +++++ ...ull-fcp_rsp-iu-in-scsi-trace-records.patch | 189 +++++++++++++++ ...or-scsi_eh-commands-when-dix-enabled.patch | 59 +++++ ...on-dismiss-or-timedout-late-response.patch | 217 ++++++++++++++++++ ...ace-high-part-of-new-64-bit-scsi-lun.patch | 150 ++++++++++++ queue-3.18/series | 8 + ...-unloading-triggers-a-use-after-free.patch | 78 +++++++ ...mware-before-triggering-the-doorbell.patch | 49 ++++ 9 files changed, 902 insertions(+) create mode 100644 queue-3.18/scsi-zfcp-add-handling-for-fcp_resid_over-to-the-fcp-ingress-path.patch create mode 100644 queue-3.18/scsi-zfcp-fix-missing-trace-records-for-early-returns-in-tmf-eh-handlers.patch create mode 100644 queue-3.18/scsi-zfcp-fix-payload-with-full-fcp_rsp-iu-in-scsi-trace-records.patch create mode 100644 queue-3.18/scsi-zfcp-fix-queuecommand-for-scsi_eh-commands-when-dix-enabled.patch create mode 100644 queue-3.18/scsi-zfcp-trace-hba-fsf-response-by-default-on-dismiss-or-timedout-late-response.patch create mode 100644 queue-3.18/scsi-zfcp-trace-high-part-of-new-64-bit-scsi-lun.patch create mode 100644 queue-3.18/skd-avoid-that-module-unloading-triggers-a-use-after-free.patch create mode 100644 queue-3.18/skd-submit-requests-to-firmware-before-triggering-the-doorbell.patch diff --git a/queue-3.18/scsi-zfcp-add-handling-for-fcp_resid_over-to-the-fcp-ingress-path.patch b/queue-3.18/scsi-zfcp-add-handling-for-fcp_resid_over-to-the-fcp-ingress-path.patch new file mode 100644 index 00000000000..46678d8f241 --- /dev/null +++ b/queue-3.18/scsi-zfcp-add-handling-for-fcp_resid_over-to-the-fcp-ingress-path.patch @@ -0,0 +1,90 @@ +From a099b7b1fc1f0418ab8d79ecf98153e1e134656e Mon Sep 17 00:00:00 2001 +From: Benjamin Block +Date: Fri, 28 Jul 2017 12:30:52 +0200 +Subject: scsi: zfcp: add handling for FCP_RESID_OVER to the fcp ingress path + +From: Benjamin Block + +commit a099b7b1fc1f0418ab8d79ecf98153e1e134656e upstream. + +Up until now zfcp would just ignore the FCP_RESID_OVER flag in the FCP +response IU. When this flag is set, it is possible, in regards to the +FCP standard, that the storage-server processes the command normally, up +to the point where data is missing and simply ignores those. + +In this case no CHECK CONDITION would be set, and because we ignored the +FCP_RESID_OVER flag we resulted in at least a data loss or even +-corruption as a follow-up error, depending on how the +applications/layers on top behave. To prevent this, we now set the +host-byte of the corresponding scsi_cmnd to DID_ERROR. + +Other storage-behaviors, where the same condition results in a CHECK +CONDITION set in the answer, don't need to be changed as they are +handled in the mid-layer already. + +Following is an example trace record decoded with zfcpdbf from the +s390-tools package. We forcefully injected a fc_dl which is one byte too +small: + +Timestamp : ... +Area : SCSI +Subarea : 00 +Level : 3 +Exception : - +CPU ID : .. +Caller : 0x... +Record ID : 1 +Tag : rsl_err +Request ID : 0x... +SCSI ID : 0x... +SCSI LUN : 0x... +SCSI result : 0x00070000 + ^^DID_ERROR +SCSI retries : 0x.. +SCSI allowed : 0x.. +SCSI scribble : 0x... +SCSI opcode : 2a000000 00000000 08000000 00000000 +FCP rsp inf cod: 0x00 +FCP rsp IU : 00000000 00000000 00000400 00000001 + ^^fr_flags==FCP_RESID_OVER + ^^fr_status==SAM_STAT_GOOD + ^^^^^^^^fr_resid + 00000000 00000000 + +As of now, we don't actively handle to possibility that a response IU +has both flags - FCP_RESID_OVER and FCP_RESID_UNDER - set at once. + +Reported-by: Luke M. Hopkins +Reviewed-by: Steffen Maier +Fixes: 553448f6c483 ("[SCSI] zfcp: Message cleanup") +Fixes: ea127f975424 ("[PATCH] s390 (7/7): zfcp host adapter.") (tglx/history.git) +Signed-off-by: Benjamin Block +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/s390/scsi/zfcp_fc.h | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/s390/scsi/zfcp_fc.h ++++ b/drivers/s390/scsi/zfcp_fc.h +@@ -4,7 +4,7 @@ + * Fibre Channel related definitions and inline functions for the zfcp + * device driver + * +- * Copyright IBM Corp. 2009 ++ * Copyright IBM Corp. 2009, 2017 + */ + + #ifndef ZFCP_FC_H +@@ -291,6 +291,10 @@ void zfcp_fc_eval_fcp_rsp(struct fcp_res + !(rsp_flags & FCP_SNS_LEN_VAL) && + fcp_rsp->resp.fr_status == SAM_STAT_GOOD) + set_host_byte(scsi, DID_ERROR); ++ } else if (unlikely(rsp_flags & FCP_RESID_OVER)) { ++ /* FCP_DL was not sufficient for SCSI data length */ ++ if (fcp_rsp->resp.fr_status == SAM_STAT_GOOD) ++ set_host_byte(scsi, DID_ERROR); + } + } + diff --git a/queue-3.18/scsi-zfcp-fix-missing-trace-records-for-early-returns-in-tmf-eh-handlers.patch b/queue-3.18/scsi-zfcp-fix-missing-trace-records-for-early-returns-in-tmf-eh-handlers.patch new file mode 100644 index 00000000000..de978b17110 --- /dev/null +++ b/queue-3.18/scsi-zfcp-fix-missing-trace-records-for-early-returns-in-tmf-eh-handlers.patch @@ -0,0 +1,62 @@ +From 1a5d999ebfc7bfe28deb48931bb57faa8e4102b6 Mon Sep 17 00:00:00 2001 +From: Steffen Maier +Date: Fri, 28 Jul 2017 12:30:55 +0200 +Subject: scsi: zfcp: fix missing trace records for early returns in TMF eh handlers + +From: Steffen Maier + +commit 1a5d999ebfc7bfe28deb48931bb57faa8e4102b6 upstream. + +For problem determination we need to see that we were in scsi_eh +as well as whether and why we were successful or not. + +The following commits introduced new early returns without adding +a trace record: + +v2.6.35 commit a1dbfddd02d2 +("[SCSI] zfcp: Pass return code from fc_block_scsi_eh to scsi eh") +on fc_block_scsi_eh() returning != 0 which is FAST_IO_FAIL, + +v2.6.30 commit 63caf367e1c9 +("[SCSI] zfcp: Improve reliability of SCSI eh handlers in zfcp") +on not having gotten an FSF request after the maximum number of retry +attempts and thus could not issue a TMF and has to return FAILED. + +Signed-off-by: Steffen Maier +Fixes: a1dbfddd02d2 ("[SCSI] zfcp: Pass return code from fc_block_scsi_eh to scsi eh") +Fixes: 63caf367e1c9 ("[SCSI] zfcp: Improve reliability of SCSI eh handlers in zfcp") +Reviewed-by: Benjamin Block +Signed-off-by: Benjamin Block +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/s390/scsi/zfcp_scsi.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/drivers/s390/scsi/zfcp_scsi.c ++++ b/drivers/s390/scsi/zfcp_scsi.c +@@ -245,8 +245,10 @@ static int zfcp_task_mgmt_function(struc + + zfcp_erp_wait(adapter); + ret = fc_block_scsi_eh(scpnt); +- if (ret) ++ if (ret) { ++ zfcp_dbf_scsi_devreset("fiof", scpnt, tm_flags, NULL); + return ret; ++ } + + if (!(atomic_read(&adapter->status) & + ZFCP_STATUS_COMMON_RUNNING)) { +@@ -254,8 +256,10 @@ static int zfcp_task_mgmt_function(struc + return SUCCESS; + } + } +- if (!fsf_req) ++ if (!fsf_req) { ++ zfcp_dbf_scsi_devreset("reqf", scpnt, tm_flags, NULL); + return FAILED; ++ } + + wait_for_completion(&fsf_req->completion); + diff --git a/queue-3.18/scsi-zfcp-fix-payload-with-full-fcp_rsp-iu-in-scsi-trace-records.patch b/queue-3.18/scsi-zfcp-fix-payload-with-full-fcp_rsp-iu-in-scsi-trace-records.patch new file mode 100644 index 00000000000..44f0ad6de08 --- /dev/null +++ b/queue-3.18/scsi-zfcp-fix-payload-with-full-fcp_rsp-iu-in-scsi-trace-records.patch @@ -0,0 +1,189 @@ +From 12c3e5754c8022a4f2fd1e9f00d19e99ee0d3cc1 Mon Sep 17 00:00:00 2001 +From: Steffen Maier +Date: Fri, 28 Jul 2017 12:30:56 +0200 +Subject: scsi: zfcp: fix payload with full FCP_RSP IU in SCSI trace records + +From: Steffen Maier + +commit 12c3e5754c8022a4f2fd1e9f00d19e99ee0d3cc1 upstream. + +If the FCP_RSP UI has optional parts (FCP_SNS_INFO or FCP_RSP_INFO) and +thus does not fit into the fsp_rsp field built into a SCSI trace record, +trace the full FCP_RSP UI with all optional parts as payload record +instead of just FCP_SNS_INFO as payload and +a 1 byte RSP_INFO_CODE part of FCP_RSP_INFO built into the SCSI record. + +That way we would also get the full FCP_SNS_INFO in case a +target would ever send more than +min(SCSI_SENSE_BUFFERSIZE==96, ZFCP_DBF_PAY_MAX_REC==256)==96. + +The mandatory part of FCP_RSP IU is only 24 bytes. +PAYload costs at least one full PAY record of 256 bytes anyway. +We cap to the hardware response size which is only FSF_FCP_RSP_SIZE==128. +So we can just put the whole FCP_RSP IU with any optional parts into +PAYload similarly as we do for SAN PAY since v4.9 commit aceeffbb59bb +("zfcp: trace full payload of all SAN records (req,resp,iels)"). +This does not cause any additional trace records wasting memory. + +Decoded trace records were confusing because they showed a hard-coded +sense data length of 96 even if the FCP_RSP_IU field FCP_SNS_LEN showed +actually less. + +Since the same commit, we set pl_len for SAN traces to the full length of a +request/response even if we cap the corresponding trace. +In contrast, here for SCSI traces we set pl_len to the pre-computed +length of FCP_RSP IU considering SNS_LEN or RSP_LEN if valid. +Nonetheless we trace a hardcoded payload of length FSF_FCP_RSP_SIZE==128 +if there were optional parts. +This makes it easier for the zfcpdbf tool to format only the relevant +part of the long FCP_RSP UI buffer. And any trailing information is still +available in the payload trace record just in case. + +Rename the payload record tag from "fcp_sns" to "fcp_riu" to make the new +content explicit to zfcpdbf which can then pick a suitable field name such +as "FCP rsp IU all:" instead of "Sense info :" +Also, the same zfcpdbf can still be backwards compatible with "fcp_sns". + +Old example trace record before this fix, formatted with the tool zfcpdbf +from s390-tools: + +Timestamp : ... +Area : SCSI +Subarea : 00 +Level : 3 +Exception : - +CPU id : .. +Caller : 0x... +Record id : 1 +Tag : rsl_err +Request id : 0x +SCSI ID : 0x... +SCSI LUN : 0x... +SCSI result : 0x00000002 +SCSI retries : 0x00 +SCSI allowed : 0x05 +SCSI scribble : 0x +SCSI opcode : 00000000 00000000 00000000 00000000 +FCP rsp inf cod: 0x00 +FCP rsp IU : 00000000 00000000 00000202 00000000 + ^^==FCP_SNS_LEN_VALID + 00000020 00000000 + ^^^^^^^^==FCP_SNS_LEN==32 +Sense len : 96 <==min(SCSI_SENSE_BUFFERSIZE,ZFCP_DBF_PAY_MAX_REC) +Sense info : 70000600 00000018 00000000 29000000 + 00000400 00000000 00000000 00000000 + 00000000 00000000 00000000 00000000<==superfluous + 00000000 00000000 00000000 00000000<==superfluous + 00000000 00000000 00000000 00000000<==superfluous + 00000000 00000000 00000000 00000000<==superfluous + +New example trace records with this fix: + +Timestamp : ... +Area : SCSI +Subarea : 00 +Level : 3 +Exception : - +CPU ID : .. +Caller : 0x... +Record ID : 1 +Tag : rsl_err +Request ID : 0x +SCSI ID : 0x... +SCSI LUN : 0x... +SCSI result : 0x00000002 +SCSI retries : 0x00 +SCSI allowed : 0x03 +SCSI scribble : 0x +SCSI opcode : a30c0112 00000000 02000000 00000000 +FCP rsp inf cod: 0x00 +FCP rsp IU : 00000000 00000000 00000a02 00000200 + 00000020 00000000 +FCP rsp IU len : 56 +FCP rsp IU all : 00000000 00000000 00000a02 00000200 + ^^=FCP_RESID_UNDER|FCP_SNS_LEN_VALID + 00000020 00000000 70000500 00000018 + ^^^^^^^^==FCP_SNS_LEN + ^^^^^^^^^^^^^^^^^ + 00000000 240000cb 00011100 00000000 + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + 00000000 00000000 + ^^^^^^^^^^^^^^^^^==FCP_SNS_INFO + +Timestamp : ... +Area : SCSI +Subarea : 00 +Level : 1 +Exception : - +CPU ID : .. +Caller : 0x... +Record ID : 1 +Tag : lr_okay +Request ID : 0x +SCSI ID : 0x... +SCSI LUN : 0x... +SCSI result : 0x00000000 +SCSI retries : 0x00 +SCSI allowed : 0x05 +SCSI scribble : 0x +SCSI opcode : +FCP rsp inf cod: 0x00 +FCP rsp IU : 00000000 00000000 00000100 00000000 + 00000000 00000008 +FCP rsp IU len : 32 +FCP rsp IU all : 00000000 00000000 00000100 00000000 + ^^==FCP_RSP_LEN_VALID + 00000000 00000008 00000000 00000000 + ^^^^^^^^==FCP_RSP_LEN + ^^^^^^^^^^^^^^^^^==FCP_RSP_INFO + +Signed-off-by: Steffen Maier +Fixes: 250a1352b95e ("[SCSI] zfcp: Redesign of the debug tracing for SCSI records.") +Reviewed-by: Benjamin Block +Signed-off-by: Benjamin Block +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/s390/scsi/zfcp_dbf.c | 21 +++++++++++++++++---- + 1 file changed, 17 insertions(+), 4 deletions(-) + +--- a/drivers/s390/scsi/zfcp_dbf.c ++++ b/drivers/s390/scsi/zfcp_dbf.c +@@ -427,19 +427,32 @@ void zfcp_dbf_scsi(char *tag, struct scs + + if (fsf) { + rec->fsf_req_id = fsf->req_id; ++ rec->pl_len = FCP_RESP_WITH_EXT; + fcp_rsp = (struct fcp_resp_with_ext *) + &(fsf->qtcb->bottom.io.fcp_rsp); ++ /* mandatory parts of FCP_RSP IU in this SCSI record */ + memcpy(&rec->fcp_rsp, fcp_rsp, FCP_RESP_WITH_EXT); + if (fcp_rsp->resp.fr_flags & FCP_RSP_LEN_VAL) { + fcp_rsp_info = (struct fcp_resp_rsp_info *) &fcp_rsp[1]; + rec->fcp_rsp_info = fcp_rsp_info->rsp_code; ++ rec->pl_len += be32_to_cpu(fcp_rsp->ext.fr_rsp_len); + } + if (fcp_rsp->resp.fr_flags & FCP_SNS_LEN_VAL) { +- rec->pl_len = min((u16)SCSI_SENSE_BUFFERSIZE, +- (u16)ZFCP_DBF_PAY_MAX_REC); +- zfcp_dbf_pl_write(dbf, sc->sense_buffer, rec->pl_len, +- "fcp_sns", fsf->req_id); ++ rec->pl_len += be32_to_cpu(fcp_rsp->ext.fr_sns_len); + } ++ /* complete FCP_RSP IU in associated PAYload record ++ * but only if there are optional parts ++ */ ++ if (fcp_rsp->resp.fr_flags != 0) ++ zfcp_dbf_pl_write( ++ dbf, fcp_rsp, ++ /* at least one full PAY record ++ * but not beyond hardware response field ++ */ ++ min_t(u16, max_t(u16, rec->pl_len, ++ ZFCP_DBF_PAY_MAX_REC), ++ FSF_FCP_RSP_SIZE), ++ "fcp_riu", fsf->req_id); + } + + debug_event(dbf->scsi, 1, rec, sizeof(*rec)); diff --git a/queue-3.18/scsi-zfcp-fix-queuecommand-for-scsi_eh-commands-when-dix-enabled.patch b/queue-3.18/scsi-zfcp-fix-queuecommand-for-scsi_eh-commands-when-dix-enabled.patch new file mode 100644 index 00000000000..9ee2176be6f --- /dev/null +++ b/queue-3.18/scsi-zfcp-fix-queuecommand-for-scsi_eh-commands-when-dix-enabled.patch @@ -0,0 +1,59 @@ +From 71b8e45da51a7b64a23378221c0a5868bd79da4f Mon Sep 17 00:00:00 2001 +From: Steffen Maier +Date: Fri, 28 Jul 2017 12:30:51 +0200 +Subject: scsi: zfcp: fix queuecommand for scsi_eh commands when DIX enabled + +From: Steffen Maier + +commit 71b8e45da51a7b64a23378221c0a5868bd79da4f upstream. + +Since commit db007fc5e20c ("[SCSI] Command protection operation"), +scsi_eh_prep_cmnd() saves scmd->prot_op and temporarily resets it to +SCSI_PROT_NORMAL. +Other FCP LLDDs such as qla2xxx and lpfc shield their queuecommand() +to only access any of scsi_prot_sg...() if +(scsi_get_prot_op(cmd) != SCSI_PROT_NORMAL). + +Do the same thing for zfcp, which introduced DIX support with +commit ef3eb71d8ba4 ("[SCSI] zfcp: Introduce experimental support for +DIF/DIX"). + +Otherwise, TUR SCSI commands as part of scsi_eh likely fail in zfcp, +because the regular SCSI command with DIX protection data, that scsi_eh +re-uses in scsi_send_eh_cmnd(), of course still has +(scsi_prot_sg_count() != 0) and so zfcp sends down bogus requests to the +FCP channel hardware. + +This causes scsi_eh_test_devices() to have (finish_cmds == 0) +[not SCSI device is online or not scsi_eh_tur() failed] +so regular SCSI commands, that caused / were affected by scsi_eh, +are moved to work_q and scsi_eh_test_devices() itself returns false. +In turn, it unnecessarily escalates in our case in scsi_eh_ready_devs() +beyond host reset to finally scsi_eh_offline_sdevs() +which sets affected SCSI devices offline with the following kernel message: + +"kernel: sd H:0:T:L: Device offlined - not ready after error recovery" + +Signed-off-by: Steffen Maier +Fixes: ef3eb71d8ba4 ("[SCSI] zfcp: Introduce experimental support for DIF/DIX") +Reviewed-by: Benjamin Block +Signed-off-by: Benjamin Block +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/s390/scsi/zfcp_fsf.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/s390/scsi/zfcp_fsf.c ++++ b/drivers/s390/scsi/zfcp_fsf.c +@@ -2247,7 +2247,8 @@ int zfcp_fsf_fcp_cmnd(struct scsi_cmnd * + fcp_cmnd = (struct fcp_cmnd *) &req->qtcb->bottom.io.fcp_cmnd; + zfcp_fc_scsi_to_fcp(fcp_cmnd, scsi_cmnd, 0); + +- if (scsi_prot_sg_count(scsi_cmnd)) { ++ if ((scsi_get_prot_op(scsi_cmnd) != SCSI_PROT_NORMAL) && ++ scsi_prot_sg_count(scsi_cmnd)) { + zfcp_qdio_set_data_div(qdio, &req->qdio_req, + scsi_prot_sg_count(scsi_cmnd)); + retval = zfcp_qdio_sbals_from_sg(qdio, &req->qdio_req, diff --git a/queue-3.18/scsi-zfcp-trace-hba-fsf-response-by-default-on-dismiss-or-timedout-late-response.patch b/queue-3.18/scsi-zfcp-trace-hba-fsf-response-by-default-on-dismiss-or-timedout-late-response.patch new file mode 100644 index 00000000000..59c3aacd5d7 --- /dev/null +++ b/queue-3.18/scsi-zfcp-trace-hba-fsf-response-by-default-on-dismiss-or-timedout-late-response.patch @@ -0,0 +1,217 @@ +From fdb7cee3b9e3c561502e58137a837341f10cbf8b Mon Sep 17 00:00:00 2001 +From: Steffen Maier +Date: Fri, 28 Jul 2017 12:30:57 +0200 +Subject: scsi: zfcp: trace HBA FSF response by default on dismiss or timedout late response + +From: Steffen Maier + +commit fdb7cee3b9e3c561502e58137a837341f10cbf8b upstream. + +At the default trace level, we only trace unsuccessful events including +FSF responses. + +zfcp_dbf_hba_fsf_response() only used protocol status and FSF status to +decide on an unsuccessful response. However, this is only one of multiple +possible sources determining a failed struct zfcp_fsf_req. + +An FSF request can also "fail" if its response runs into an ERP timeout +or if it gets dismissed because a higher level recovery was triggered +[trace tags "erscf_1" or "erscf_2" in zfcp_erp_strategy_check_fsfreq()]. +FSF requests with ERP timeout are: +FSF_QTCB_EXCHANGE_CONFIG_DATA, FSF_QTCB_EXCHANGE_PORT_DATA, +FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT or +FSF_QTCB_CLOSE_PHYSICAL_PORT for target ports, +FSF_QTCB_OPEN_LUN, FSF_QTCB_CLOSE_LUN. +One example is slow queue processing which can cause follow-on errors, +e.g. FSF_PORT_ALREADY_OPEN after FSF_QTCB_OPEN_PORT_WITH_DID timed out. +In order to see the root cause, we need to see late responses even if the +channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. +Example trace records formatted with zfcpdbf from the s390-tools package: + +Timestamp : ... +Area : REC +Subarea : 00 +Level : 1 +Exception : - +CPU ID : .. +Caller : ... +Record ID : 1 +Tag : fcegpf1 +LUN : 0xffffffffffffffff +WWPN : 0x +D_ID : 0x00 +Adapter status : 0x5400050b +Port status : 0x41200000 +LUN status : 0x00000000 +Ready count : 0x00000001 +Running count : 0x... +ERP want : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT +ERP need : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT +| +Timestamp : ... 30 seconds later +Area : REC +Subarea : 00 +Level : 1 +Exception : - +CPU ID : .. +Caller : ... +Record ID : 2 +Tag : erscf_2 +LUN : 0xffffffffffffffff +WWPN : 0x +D_ID : 0x00 +Adapter status : 0x5400050b +Port status : 0x41200000 +LUN status : 0x00000000 +Request ID : 0x +ERP status : 0x10000000 ZFCP_STATUS_ERP_TIMEDOUT +ERP step : 0x0800 ZFCP_ERP_STEP_PORT_OPENING +ERP action : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT +ERP count : 0x00 +| +Timestamp : ... later than previous record +Area : HBA +Subarea : 00 +Level : 5 > default level => 3 <= default level +Exception : - +CPU ID : 00 +Caller : ... +Record ID : 1 +Tag : fs_qtcb => fs_rerr +Request ID : 0x +Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED + | ZFCP_STATUS_FSFREQ_CLEANUP +FSF cmnd : 0x00000005 +FSF sequence no: 0x... +FSF issued : ... > 30 seconds ago +FSF stat : 0x00000000 FSF_GOOD +FSF stat qual : 00000000 00000000 00000000 00000000 +Prot stat : 0x00000001 FSF_PROT_GOOD +Prot stat qual : 00000000 00000000 00000000 00000000 +Port handle : 0x... +LUN handle : 0x00000000 +QTCB log length: ... +QTCB log info : ... + +In case of problems detecting that new responses are waiting on the input +queue, we sooner or later trigger adapter recovery due to an FSF request +timeout (trace tag "fsrth_1"). +FSF requests with FSF request timeout are: +typically FSF_QTCB_ABORT_FCP_CMND; but theoretically also +FSF_QTCB_EXCHANGE_CONFIG_DATA or FSF_QTCB_EXCHANGE_PORT_DATA via sysfs, +FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT for WKA ports, +FSF_QTCB_FCP_CMND for task management function (LUN / target reset). +One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD +because the channel filled in the response via DMA into the request's QTCB. + +In a theroretical case, inject code can create an erroneous FSF request +on purpose. If data router is enabled, it uses deferred error reporting. +A READ SCSI command can succeed with FSF_PROT_GOOD, FSF_GOOD, and +SAM_STAT_GOOD. But on writing the read data to host memory via DMA, +it can still fail, e.g. if an intentionally wrong scatter list does not +provide enough space. Rather than getting an unsuccessful response, +we get a QDIO activate check which in turn triggers adapter recovery. +One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD +because the channel filled in the response via DMA into the request's QTCB. +Example trace records formatted with zfcpdbf from the s390-tools package: + +Timestamp : ... +Area : HBA +Subarea : 00 +Level : 6 > default level => 3 <= default level +Exception : - +CPU ID : .. +Caller : ... +Record ID : 1 +Tag : fs_norm => fs_rerr +Request ID : 0x +Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED + | ZFCP_STATUS_FSFREQ_CLEANUP +FSF cmnd : 0x00000001 +FSF sequence no: 0x... +FSF issued : ... +FSF stat : 0x00000000 FSF_GOOD +FSF stat qual : 00000000 00000000 00000000 00000000 +Prot stat : 0x00000001 FSF_PROT_GOOD +Prot stat qual : ........ ........ 00000000 00000000 +Port handle : 0x... +LUN handle : 0x... +| +Timestamp : ... +Area : SCSI +Subarea : 00 +Level : 3 +Exception : - +CPU ID : .. +Caller : ... +Record ID : 1 +Tag : rsl_err +Request ID : 0x +SCSI ID : 0x... +SCSI LUN : 0x... +SCSI result : 0x000e0000 DID_TRANSPORT_DISRUPTED +SCSI retries : 0x00 +SCSI allowed : 0x05 +SCSI scribble : 0x +SCSI opcode : 28... Read(10) +FCP rsp inf cod: 0x00 +FCP rsp IU : 00000000 00000000 00000000 00000000 + ^^ SAM_STAT_GOOD + 00000000 00000000 + +Only with luck in both above cases, we could see a follow-on trace record +of an unsuccesful event following a successful but late FSF response with +FSF_PROT_GOOD and FSF_GOOD. Typically this was the case for I/O requests +resulting in a SCSI trace record "rsl_err" with DID_TRANSPORT_DISRUPTED +[On ZFCP_STATUS_FSFREQ_DISMISSED, zfcp_fsf_protstatus_eval() sets +ZFCP_STATUS_FSFREQ_ERROR seen by the request handler functions as failure]. +However, the reason for this follow-on trace was invisible because the +corresponding HBA trace record was missing at the default trace level +(by default hidden records with tags "fs_norm", "fs_qtcb", or "fs_open"). + +On adapter recovery, after we had shut down the QDIO queues, we perform +unsuccessful pseudo completions with flag ZFCP_STATUS_FSFREQ_DISMISSED +for each pending FSF request in zfcp_fsf_req_dismiss_all(). +In order to find the root cause, we need to see all pseudo responses even +if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. + +Therefore, check zfcp_fsf_req.status for ZFCP_STATUS_FSFREQ_DISMISSED +or ZFCP_STATUS_FSFREQ_ERROR and trace with a new tag "fs_rerr". + +It does not matter that there are numerous places which set +ZFCP_STATUS_FSFREQ_ERROR after the location where we trace an FSF response +early. These cases are based on protocol status != FSF_PROT_GOOD or +== FSF_PROT_FSF_STATUS_PRESENTED and are thus already traced by default +as trace tag "fs_perr" or "fs_ferr" respectively. + +NB: The trace record with tag "fssrh_1" for status read buffers on dismiss +all remains. zfcp_fsf_req_complete() handles this and returns early. +All other FSF request types are handled separately and as described above. + +Signed-off-by: Steffen Maier +Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") +Fixes: 2e261af84cdb ("[SCSI] zfcp: Only collect FSF/HBA debug data for matching trace levels") +Reviewed-by: Benjamin Block +Signed-off-by: Benjamin Block +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/s390/scsi/zfcp_dbf.h | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/s390/scsi/zfcp_dbf.h ++++ b/drivers/s390/scsi/zfcp_dbf.h +@@ -291,7 +291,11 @@ void zfcp_dbf_hba_fsf_response(struct zf + { + struct fsf_qtcb *qtcb = req->qtcb; + +- if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) && ++ if (unlikely(req->status & (ZFCP_STATUS_FSFREQ_DISMISSED | ++ ZFCP_STATUS_FSFREQ_ERROR))) { ++ zfcp_dbf_hba_fsf_resp("fs_rerr", 3, req); ++ ++ } else if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) && + (qtcb->prefix.prot_status != FSF_PROT_FSF_STATUS_PRESENTED)) { + zfcp_dbf_hba_fsf_resp("fs_perr", 1, req); + diff --git a/queue-3.18/scsi-zfcp-trace-high-part-of-new-64-bit-scsi-lun.patch b/queue-3.18/scsi-zfcp-trace-high-part-of-new-64-bit-scsi-lun.patch new file mode 100644 index 00000000000..80855b5c02e --- /dev/null +++ b/queue-3.18/scsi-zfcp-trace-high-part-of-new-64-bit-scsi-lun.patch @@ -0,0 +1,150 @@ +From 5d4a3d0a2ff23799b956e5962b886287614e7fad Mon Sep 17 00:00:00 2001 +From: Steffen Maier +Date: Fri, 28 Jul 2017 12:30:58 +0200 +Subject: scsi: zfcp: trace high part of "new" 64 bit SCSI LUN + +From: Steffen Maier + +commit 5d4a3d0a2ff23799b956e5962b886287614e7fad upstream. + +Complements debugging aspects of the otherwise functionally complete +v3.17 commit 9cb78c16f5da ("scsi: use 64-bit LUNs"). + +While I don't have access to a target exporting 3 or 4 level LUNs, +I did test it by explicitly attaching a non-existent fake 4 level LUN +by means of zfcp sysfs attribute "unit_add". +In order to see corresponding trace records of otherwise successful +events, we had to increase the trace level of area SCSI and HBA to 6. + +$ echo 6 > /sys/kernel/debug/s390dbf/zfcp_0.0.1880_scsi/level +$ echo 6 > /sys/kernel/debug/s390dbf/zfcp_0.0.1880_hba/level + +$ echo 0x4011402240334044 > \ + /sys/bus/ccw/drivers/zfcp/0.0.1880/0x50050763031bd327/unit_add + +Example output formatted by an updated zfcpdbf from the s390-tools +package interspersed with kernel messages at scsi_logging_level=4605: + +Timestamp : ... +Area : REC +Subarea : 00 +Level : 1 +Exception : - +CPU ID : .. +Caller : 0x... +Record ID : 1 +Tag : scsla_1 +LUN : 0x4011402240334044 +WWPN : 0x50050763031bd327 +D_ID : 0x00...... +Adapter status : 0x5400050b +Port status : 0x54000001 +LUN status : 0x41000000 +Ready count : 0x00000001 +Running count : 0x00000000 +ERP want : 0x01 +ERP need : 0x01 + +scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY pass 1 length 36 +scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY successful with code 0x0 + +Timestamp : ... +Area : HBA +Subarea : 00 +Level : 6 +Exception : - +CPU ID : .. +Caller : 0x... +Record ID : 1 +Tag : fs_norm +Request ID : 0x +Request status : 0x00000010 +FSF cmnd : 0x00000001 +FSF sequence no: 0x... +FSF issued : ... +FSF stat : 0x00000000 +FSF stat qual : 00000000 00000000 00000000 00000000 +Prot stat : 0x00000001 +Prot stat qual : ........ ........ 00000000 00000000 +Port handle : 0x... +LUN handle : 0x... +| +Timestamp : ... +Area : SCSI +Subarea : 00 +Level : 6 +Exception : - +CPU ID : .. +Caller : 0x... +Record ID : 1 +Tag : rsl_nor +Request ID : 0x +SCSI ID : 0x00000000 +SCSI LUN : 0x40224011 +SCSI LUN high : 0x40444033 <======================= +SCSI result : 0x00000000 +SCSI retries : 0x00 +SCSI allowed : 0x03 +SCSI scribble : 0x +SCSI opcode : 12000000 a4000000 00000000 00000000 +FCP rsp inf cod: 0x00 +FCP rsp IU : 00000000 00000000 00000000 00000000 + 00000000 00000000 + +scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY pass 2 length 164 +scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY successful with code 0x0 +scsi 2:0:0:4630896905707208721: scsi scan: peripheral device type of 31, \ +no device added + +Signed-off-by: Steffen Maier +Fixes: 9cb78c16f5da ("scsi: use 64-bit LUNs") +Reviewed-by: Benjamin Block +Reviewed-by: Jens Remus +Signed-off-by: Benjamin Block +Signed-off-by: Martin K. Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/s390/scsi/zfcp_dbf.c | 2 +- + drivers/s390/scsi/zfcp_dbf.h | 4 +++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/s390/scsi/zfcp_dbf.c ++++ b/drivers/s390/scsi/zfcp_dbf.c +@@ -418,8 +418,8 @@ void zfcp_dbf_scsi(char *tag, struct scs + rec->scsi_retries = sc->retries; + rec->scsi_allowed = sc->allowed; + rec->scsi_id = sc->device->id; +- /* struct zfcp_dbf_scsi needs to be updated to handle 64bit LUNs */ + rec->scsi_lun = (u32)sc->device->lun; ++ rec->scsi_lun_64_hi = (u32)(sc->device->lun >> 32); + rec->host_scribble = (unsigned long)sc->host_scribble; + + memcpy(rec->scsi_opcode, sc->cmnd, +--- a/drivers/s390/scsi/zfcp_dbf.h ++++ b/drivers/s390/scsi/zfcp_dbf.h +@@ -196,7 +196,7 @@ enum zfcp_dbf_scsi_id { + * @id: unique number of recovery record type + * @tag: identifier string specifying the location of initiation + * @scsi_id: scsi device id +- * @scsi_lun: scsi device logical unit number ++ * @scsi_lun: scsi device logical unit number, low part of 64 bit, old 32 bit + * @scsi_result: scsi result + * @scsi_retries: current retry number of scsi request + * @scsi_allowed: allowed retries +@@ -206,6 +206,7 @@ enum zfcp_dbf_scsi_id { + * @host_scribble: LLD specific data attached to SCSI request + * @pl_len: length of paload stored as zfcp_dbf_pay + * @fsf_rsp: response for fsf request ++ * @scsi_lun_64_hi: scsi device logical unit number, high part of 64 bit + */ + struct zfcp_dbf_scsi { + u8 id; +@@ -222,6 +223,7 @@ struct zfcp_dbf_scsi { + u64 host_scribble; + u16 pl_len; + struct fcp_resp_with_ext fcp_rsp; ++ u32 scsi_lun_64_hi; + } __packed; + + /** diff --git a/queue-3.18/series b/queue-3.18/series index f78378bbd9e..a981176b34e 100644 --- a/queue-3.18/series +++ b/queue-3.18/series @@ -13,3 +13,11 @@ ext4-fix-incorrect-quotaoff-if-the-quota-feature-is-enabled.patch powerpc-fix-dar-reporting-when-alignment-handler-faults.patch block-relax-a-check-in-blk_start_queue.patch md-bitmap-disable-bitmap_resize-for-file-backed-bitmaps.patch +skd-avoid-that-module-unloading-triggers-a-use-after-free.patch +skd-submit-requests-to-firmware-before-triggering-the-doorbell.patch +scsi-zfcp-fix-queuecommand-for-scsi_eh-commands-when-dix-enabled.patch +scsi-zfcp-add-handling-for-fcp_resid_over-to-the-fcp-ingress-path.patch +scsi-zfcp-fix-missing-trace-records-for-early-returns-in-tmf-eh-handlers.patch +scsi-zfcp-fix-payload-with-full-fcp_rsp-iu-in-scsi-trace-records.patch +scsi-zfcp-trace-hba-fsf-response-by-default-on-dismiss-or-timedout-late-response.patch +scsi-zfcp-trace-high-part-of-new-64-bit-scsi-lun.patch diff --git a/queue-3.18/skd-avoid-that-module-unloading-triggers-a-use-after-free.patch b/queue-3.18/skd-avoid-that-module-unloading-triggers-a-use-after-free.patch new file mode 100644 index 00000000000..996cb54fe25 --- /dev/null +++ b/queue-3.18/skd-avoid-that-module-unloading-triggers-a-use-after-free.patch @@ -0,0 +1,78 @@ +From 7277cc67b3916eed47558c64f9c9c0de00a35cda Mon Sep 17 00:00:00 2001 +From: Bart Van Assche +Date: Thu, 17 Aug 2017 13:12:45 -0700 +Subject: skd: Avoid that module unloading triggers a use-after-free + +From: Bart Van Assche + +commit 7277cc67b3916eed47558c64f9c9c0de00a35cda upstream. + +Since put_disk() triggers a disk_release() call and since that +last function calls blk_put_queue() if disk->queue != NULL, clear +the disk->queue pointer before calling put_disk(). This avoids +that unloading the skd kernel module triggers the following +use-after-free: + +WARNING: CPU: 8 PID: 297 at lib/refcount.c:128 refcount_sub_and_test+0x70/0x80 +refcount_t: underflow; use-after-free. +CPU: 8 PID: 297 Comm: kworker/8:1 Not tainted 4.11.10-300.fc26.x86_64 #1 +Workqueue: events work_for_cpu_fn +Call Trace: + dump_stack+0x63/0x84 + __warn+0xcb/0xf0 + warn_slowpath_fmt+0x5a/0x80 + refcount_sub_and_test+0x70/0x80 + refcount_dec_and_test+0x11/0x20 + kobject_put+0x1f/0x50 + blk_put_queue+0x15/0x20 + disk_release+0xae/0xf0 + device_release+0x32/0x90 + kobject_release+0x67/0x170 + kobject_put+0x2b/0x50 + put_disk+0x17/0x20 + skd_destruct+0x5c/0x890 [skd] + skd_pci_probe+0x124d/0x13a0 [skd] + local_pci_probe+0x42/0xa0 + work_for_cpu_fn+0x14/0x20 + process_one_work+0x19e/0x470 + worker_thread+0x1dc/0x4a0 + kthread+0x125/0x140 + ret_from_fork+0x25/0x30 + +Signed-off-by: Bart Van Assche +Cc: Christoph Hellwig +Cc: Hannes Reinecke +Cc: Johannes Thumshirn +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/skd_main.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +--- a/drivers/block/skd_main.c ++++ b/drivers/block/skd_main.c +@@ -4679,15 +4679,16 @@ static void skd_free_disk(struct skd_dev + { + struct gendisk *disk = skdev->disk; + +- if (disk != NULL) { +- struct request_queue *q = disk->queue; ++ if (disk && (disk->flags & GENHD_FL_UP)) ++ del_gendisk(disk); + +- if (disk->flags & GENHD_FL_UP) +- del_gendisk(disk); +- if (q) +- blk_cleanup_queue(q); +- put_disk(disk); ++ if (skdev->queue) { ++ blk_cleanup_queue(skdev->queue); ++ skdev->queue = NULL; ++ disk->queue = NULL; + } ++ ++ put_disk(disk); + skdev->disk = NULL; + } + diff --git a/queue-3.18/skd-submit-requests-to-firmware-before-triggering-the-doorbell.patch b/queue-3.18/skd-submit-requests-to-firmware-before-triggering-the-doorbell.patch new file mode 100644 index 00000000000..5d86baa9c6a --- /dev/null +++ b/queue-3.18/skd-submit-requests-to-firmware-before-triggering-the-doorbell.patch @@ -0,0 +1,49 @@ +From 5fbd545cd3fd311ea1d6e8be4cedddd0ee5684c7 Mon Sep 17 00:00:00 2001 +From: Bart Van Assche +Date: Thu, 17 Aug 2017 13:12:46 -0700 +Subject: skd: Submit requests to firmware before triggering the doorbell + +From: Bart Van Assche + +commit 5fbd545cd3fd311ea1d6e8be4cedddd0ee5684c7 upstream. + +Ensure that the members of struct skd_msg_buf have been transferred +to the PCIe adapter before the doorbell is triggered. This patch +avoids that I/O fails sporadically and that the following error +message is reported: + +(skd0:STM000196603:[0000:00:09.0]): Completion mismatch comp_id=0x0000 skreq=0x0400 new=0x0000 + +Signed-off-by: Bart Van Assche +Cc: Christoph Hellwig +Cc: Hannes Reinecke +Cc: Johannes Thumshirn +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/skd_main.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/block/skd_main.c ++++ b/drivers/block/skd_main.c +@@ -2214,6 +2214,9 @@ static void skd_send_fitmsg(struct skd_d + */ + qcmd |= FIT_QCMD_MSGSIZE_64; + ++ /* Make sure skd_msg_buf is written before the doorbell is triggered. */ ++ smp_wmb(); ++ + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); + + } +@@ -2260,6 +2263,9 @@ static void skd_send_special_fitmsg(stru + qcmd = skspcl->mb_dma_address; + qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; + ++ /* Make sure skd_msg_buf is written before the doorbell is triggered. */ ++ smp_wmb(); ++ + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); + } + -- 2.47.3