From: Mike Reed Subject: panic in qla1280_done following adapter reset References: bnc#493991 A customer has had repeated crashes after a SCSI adapter reset. This looks like the same issue as bug 232908. Unfortunately, the customer's solution to this was to replace the QLogic card with an LSI card (just like the customer did in bug 232908). Mike Reed has submitted a patch upstream to fix this problem. We would like to see this patch in a SLES10 SP2 update (and SLES11 update) so that other customers with this QLogic card won't hit it. Signed-off-by: Hannes Reinecke --- linux-2.6.27.19-5/drivers/scsi/qla1280.h 2008-10-09 17:13:53.000000000 -0500 +++ linux-2.6.27.19-5-modified/drivers/scsi/qla1280.h 2009-03-30 13:19:03.447145988 -0500 @@ -88,7 +88,8 @@ /* Maximum outstanding commands in ISP queues */ #define MAX_OUTSTANDING_COMMANDS 512 -#define INVALID_HANDLE (MAX_OUTSTANDING_COMMANDS + 2) +#define COMPLETED_HANDLE ((unsigned char *) \ + (MAX_OUTSTANDING_COMMANDS + 2)) /* ISP request and response entry counts (37-65535) */ #define REQUEST_ENTRY_CNT 255 /* Number of request entries. */ --- linux-2.6.27.19-5/drivers/scsi/qla1280.c 2009-02-28 01:19:41.000000000 -0600 +++ linux-2.6.27.19-5-modified/drivers/scsi/qla1280.c 2009-04-17 12:46:11.810488760 -0500 @@ -17,9 +17,12 @@ * General Public License for more details. * ******************************************************************************/ -#define QLA1280_VERSION "3.26" +#define QLA1280_VERSION "3.27" /***************************************************************************** Revision History: + Rev 3.27, February 10, 2009, Michael Reed + - General code cleanup. + - Improve error recovery. 
Rev 3.26, January 16, 2006 Jes Sorensen - Ditch all < 2.6 support Rev 3.25.1, February 10, 2005 Christoph Hellwig @@ -438,7 +441,6 @@ static int qla1280_mailbox_command(struc uint8_t, uint16_t *); static int qla1280_bus_reset(struct scsi_qla_host *, int); static int qla1280_device_reset(struct scsi_qla_host *, int, int); -static int qla1280_abort_device(struct scsi_qla_host *, int, int, int); static int qla1280_abort_command(struct scsi_qla_host *, struct srb *, int); static int qla1280_abort_isp(struct scsi_qla_host *); #ifdef QLA_64BIT_PTR @@ -710,7 +712,7 @@ qla1280_info(struct Scsi_Host *host) } /************************************************************************** - * qla1200_queuecommand + * qla1280_queuecommand * Queue a command to the controller. * * Note: @@ -725,12 +727,14 @@ qla1280_queuecommand(struct scsi_cmnd *c { struct Scsi_Host *host = cmd->device->host; struct scsi_qla_host *ha = (struct scsi_qla_host *)host->hostdata; - struct srb *sp = (struct srb *)&cmd->SCp; + struct srb *sp = (struct srb *)CMD_SP(cmd); int status; cmd->scsi_done = fn; sp->cmd = cmd; sp->flags = 0; + sp->wait = NULL; + CMD_HANDLE(cmd) = (unsigned char *)NULL; qla1280_print_scsi_cmd(5, cmd); @@ -750,21 +754,11 @@ qla1280_queuecommand(struct scsi_cmnd *c enum action { ABORT_COMMAND, - ABORT_DEVICE, DEVICE_RESET, BUS_RESET, ADAPTER_RESET, - FAIL }; -/* timer action for error action processor */ -static void qla1280_error_wait_timeout(unsigned long __data) -{ - struct scsi_cmnd *cmd = (struct scsi_cmnd *)__data; - struct srb *sp = (struct srb *)CMD_SP(cmd); - - complete(sp->wait); -} static void qla1280_mailbox_timeout(unsigned long __data) { @@ -779,8 +773,67 @@ static void qla1280_mailbox_timeout(unsi complete(ha->mailbox_wait); } +static int +_qla1280_wait_for_single_command(struct scsi_qla_host *ha, struct srb *sp, + struct completion *wait) +{ + int status = FAILED; + struct scsi_cmnd *cmd = sp->cmd; + + spin_unlock_irq(ha->host->host_lock); + 
wait_for_completion_timeout(wait, 4*HZ); + spin_lock_irq(ha->host->host_lock); + sp->wait = NULL; + if(CMD_HANDLE(cmd) == COMPLETED_HANDLE) { + status = SUCCESS; + (*cmd->scsi_done)(cmd); + } + return status; +} + +static int +qla1280_wait_for_single_command(struct scsi_qla_host *ha, struct srb *sp) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + sp->wait = &wait; + return _qla1280_wait_for_single_command(ha, sp, &wait); +} + +static int +qla1280_wait_for_pending_commands(struct scsi_qla_host *ha, int bus, int target) +{ + int cnt; + int status; + struct srb *sp; + struct scsi_cmnd *cmd; + + status = SUCCESS; + + /* + * Wait for all commands with the designated bus/target + * to be completed by the firmware + */ + for (cnt = 0; cnt < MAX_OUTSTANDING_COMMANDS; cnt++) { + sp = ha->outstanding_cmds[cnt]; + if (sp) { + cmd = sp->cmd; + + if (bus >= 0 && SCSI_BUS_32(cmd) != bus) + continue; + if (target >= 0 && SCSI_TCN_32(cmd) != target) + continue; + + status = qla1280_wait_for_single_command(ha, sp); + if (status == FAILED) + break; + } + } + return status; +} + /************************************************************************** - * qla1200_error_action + * qla1280_error_action * The function will attempt to perform a specified error action and * wait for the results (or time out). * @@ -792,11 +845,6 @@ static void qla1280_mailbox_timeout(unsi * Returns: * SUCCESS or FAILED * - * Note: - * Resetting the bus always succeeds - is has to, otherwise the - * kernel will panic! Try a surgical technique - sending a BUS - * DEVICE RESET message - on the offending target before pulling - * the SCSI bus reset line. 
**************************************************************************/ static int qla1280_error_action(struct scsi_cmnd *cmd, enum action action) @@ -804,13 +852,19 @@ qla1280_error_action(struct scsi_cmnd *c struct scsi_qla_host *ha; int bus, target, lun; struct srb *sp; - uint16_t data; - unsigned char *handle; - int result, i; + int i, found; + int result=FAILED; + int wait_for_bus=-1; + int wait_for_target = -1; DECLARE_COMPLETION_ONSTACK(wait); - struct timer_list timer; + + ENTER("qla1280_error_action"); ha = (struct scsi_qla_host *)(CMD_HOST(cmd)->hostdata); + sp = (struct srb *)CMD_SP(cmd); + bus = SCSI_BUS_32(cmd); + target = SCSI_TCN_32(cmd); + lun = SCSI_LUN_32(cmd); dprintk(4, "error_action %i, istatus 0x%04x\n", action, RD_REG_WORD(&ha->iobase->istatus)); @@ -819,99 +873,47 @@ qla1280_error_action(struct scsi_cmnd *c RD_REG_WORD(&ha->iobase->host_cmd), RD_REG_WORD(&ha->iobase->ictrl), jiffies); - ENTER("qla1280_error_action"); if (qla1280_verbose) printk(KERN_INFO "scsi(%li): Resetting Cmnd=0x%p, " "Handle=0x%p, action=0x%x\n", ha->host_no, cmd, CMD_HANDLE(cmd), action); - if (cmd == NULL) { - printk(KERN_WARNING "(scsi?:?:?:?) Reset called with NULL " - "si_Cmnd pointer, failing.\n"); - LEAVE("qla1280_error_action"); - return FAILED; - } - - ha = (struct scsi_qla_host *)cmd->device->host->hostdata; - sp = (struct srb *)CMD_SP(cmd); - handle = CMD_HANDLE(cmd); - - /* Check for pending interrupts. */ - data = qla1280_debounce_register(&ha->iobase->istatus); - /* - * The io_request_lock is held when the reset handler is called, hence - * the interrupt handler cannot be running in parallel as it also - * grabs the lock. /Jes - */ - if (data & RISC_INT) - qla1280_isr(ha, &ha->done_q); - /* - * Determine the suggested action that the mid-level driver wants - * us to perform. + * Check to see if we have the command in the outstanding_cmds[] + * array. If not then it must have completed before this error + * action was initiated. 
If the error_action isn't ABORT_COMMAND + * then the driver must proceed with the requested action. */ - if (handle == (unsigned char *)INVALID_HANDLE || handle == NULL) { - if(action == ABORT_COMMAND) { - /* we never got this command */ - printk(KERN_INFO "qla1280: Aborting a NULL handle\n"); - return SUCCESS; /* no action - we don't have command */ + found = -1; + for (i = 0; i < MAX_OUTSTANDING_COMMANDS; i++) { + if (sp == ha->outstanding_cmds[i]) { + found = i; + sp->wait = &wait; /* we'll wait for it to complete */ + break; } - } else { - sp->wait = &wait; } - bus = SCSI_BUS_32(cmd); - target = SCSI_TCN_32(cmd); - lun = SCSI_LUN_32(cmd); + if (found < 0) { /* driver doesn't have command */ + result = SUCCESS; + if (qla1280_verbose) { + printk(KERN_INFO + "scsi(%ld:%d:%d:%d): specified command has " + "already completed.\n", ha->host_no, bus, + target, lun); + } + } - /* Overloading result. Here it means the success or fail of the - * *issue* of the action. When we return from the routine, it must - * mean the actual success or fail of the action */ - result = FAILED; switch (action) { - case FAIL: - break; case ABORT_COMMAND: - if ((sp->flags & SRB_ABORT_PENDING)) { - printk(KERN_WARNING - "scsi(): Command has a pending abort " - "message - ABORT_PENDING.\n"); - /* This should technically be impossible since we - * now wait for abort completion */ - break; - } - - for (i = 0; i < MAX_OUTSTANDING_COMMANDS; i++) { - if (sp == ha->outstanding_cmds[i]) { - dprintk(1, "qla1280: RISC aborting command\n"); - if (qla1280_abort_command(ha, sp, i) == 0) - result = SUCCESS; - else { - /* - * Since we don't know what might - * have happend to the command, it - * is unsafe to remove it from the - * device's queue at this point. - * Wait and let the escalation - * process take care of it. 
- */ - printk(KERN_WARNING - "scsi(%li:%i:%i:%i): Unable" - " to abort command!\n", - ha->host_no, bus, target, lun); - } - } - } - break; - - case ABORT_DEVICE: - if (qla1280_verbose) - printk(KERN_INFO - "scsi(%ld:%d:%d:%d): Queueing abort device " - "command.\n", ha->host_no, bus, target, lun); - if (qla1280_abort_device(ha, bus, target, lun) == 0) - result = SUCCESS; + dprintk(1, "qla1280: RISC aborting command\n"); + /* + * The abort might fail due to race when the host_lock + * is released to issue the abort. As such, we + * don't bother to check the return status. + */ + if (found >= 0) + qla1280_abort_command(ha, sp, found); break; case DEVICE_RESET: @@ -919,16 +921,21 @@ qla1280_error_action(struct scsi_cmnd *c printk(KERN_INFO "scsi(%ld:%d:%d:%d): Queueing device reset " "command.\n", ha->host_no, bus, target, lun); - if (qla1280_device_reset(ha, bus, target) == 0) - result = SUCCESS; + if (qla1280_device_reset(ha, bus, target) == 0) { + /* issued device reset, set wait conditions */ + wait_for_bus = bus; + wait_for_target = target; + } break; case BUS_RESET: if (qla1280_verbose) printk(KERN_INFO "qla1280(%ld:%d): Issued bus " "reset.\n", ha->host_no, bus); - if (qla1280_bus_reset(ha, bus) == 0) - result = SUCCESS; + if (qla1280_bus_reset(ha, bus) == 0) { + /* issued bus reset, set wait conditions */ + wait_for_bus = bus; + } break; case ADAPTER_RESET: @@ -941,55 +948,48 @@ qla1280_error_action(struct scsi_cmnd *c "continue automatically\n", ha->host_no); } ha->flags.reset_active = 1; - /* - * We restarted all of the commands automatically, so the - * mid-level code can expect completions momentitarily. - */ - if (qla1280_abort_isp(ha) == 0) - result = SUCCESS; + + if (qla1280_abort_isp(ha) != 0) { /* it's dead */ + result = FAILED; + } ha->flags.reset_active = 0; } - if (!list_empty(&ha->done_q)) - qla1280_done(ha); + /* + * At this point, the host_lock has been released and retaken + * by the issuance of the mailbox command. 
+ * Wait for the command passed in by the mid-layer if it + * was found by the driver. It might have been returned + * between eh recovery steps, hence the check of the "found" + * variable. + */ - /* If we didn't manage to issue the action, or we have no - * command to wait for, exit here */ - if (result == FAILED || handle == NULL || - handle == (unsigned char *)INVALID_HANDLE) { - /* - * Clear completion queue to avoid qla1280_done() trying - * to complete the command at a later stage after we - * have exited the current context - */ - sp->wait = NULL; - goto leave; - } + if (found >= 0) + result = _qla1280_wait_for_single_command(ha, sp, &wait); - /* set up a timer just in case we're really jammed */ - init_timer(&timer); - timer.expires = jiffies + 4*HZ; - timer.data = (unsigned long)cmd; - timer.function = qla1280_error_wait_timeout; - add_timer(&timer); - - /* wait for the action to complete (or the timer to expire) */ - spin_unlock_irq(ha->host->host_lock); - wait_for_completion(&wait); - del_timer_sync(&timer); - spin_lock_irq(ha->host->host_lock); - sp->wait = NULL; + if (action == ABORT_COMMAND && result != SUCCESS) { + printk(KERN_WARNING + "scsi(%li:%i:%i:%i): " + "Unable to abort command!\n", + ha->host_no, bus, target, lun); + } - /* the only action we might get a fail for is abort */ - if (action == ABORT_COMMAND) { - if(sp->flags & SRB_ABORTED) - result = SUCCESS; - else - result = FAILED; + /* + * If the command passed in by the mid-layer has been + * returned by the board, then wait for any additional + * commands which are supposed to complete based upon + * the error action. + * + * All commands are unconditionally returned during a + * call to qla1280_abort_isp(), ADAPTER_RESET. No need + * to wait for them. 
+ */ + if (result == SUCCESS && wait_for_bus >= 0) { + result = qla1280_wait_for_pending_commands(ha, + wait_for_bus, wait_for_target); } - leave: dprintk(1, "RESET returning %d\n", result); LEAVE("qla1280_error_action"); @@ -1292,13 +1292,12 @@ qla1280_done(struct scsi_qla_host *ha) switch ((CMD_RESULT(cmd) >> 16)) { case DID_RESET: /* Issue marker command. */ - qla1280_marker(ha, bus, target, 0, MK_SYNC_ID); + if (!ha->flags.abort_isp_active) + qla1280_marker(ha, bus, target, 0, MK_SYNC_ID); break; case DID_ABORT: sp->flags &= ~SRB_ABORT_PENDING; sp->flags |= SRB_ABORTED; - if (sp->flags & SRB_TIMEOUT) - CMD_RESULT(sp->cmd) = DID_TIME_OUT << 16; break; default: break; @@ -1308,12 +1307,11 @@ qla1280_done(struct scsi_qla_host *ha) scsi_dma_unmap(cmd); /* Call the mid-level driver interrupt handler */ - CMD_HANDLE(sp->cmd) = (unsigned char *)INVALID_HANDLE; ha->actthreads--; - (*(cmd)->scsi_done)(cmd); - - if(sp->wait != NULL) + if (sp->wait == NULL) + (*(cmd)->scsi_done)(cmd); + else complete(sp->wait); } LEAVE("qla1280_done"); @@ -2386,9 +2384,6 @@ static int qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb) { struct device_reg __iomem *reg = ha->iobase; -#if 0 - LIST_HEAD(done_q); -#endif int status = 0; int cnt; uint16_t *optr, *iptr; @@ -2462,19 +2457,9 @@ qla1280_mailbox_command(struct scsi_qla_ mr = MAILBOX_REGISTER_COUNT; memcpy(optr, iptr, MAILBOX_REGISTER_COUNT * sizeof(uint16_t)); -#if 0 - /* Go check for any response interrupts pending. */ - qla1280_isr(ha, &done_q); -#endif - if (ha->flags.reset_marker) qla1280_rst_aen(ha); -#if 0 - if (!list_empty(&done_q)) - qla1280_done(ha, &done_q); -#endif - if (status) dprintk(2, "qla1280_mailbox_command: **** FAILED, mailbox0 = " "0x%x ****\n", mb[0]); @@ -2610,41 +2595,6 @@ qla1280_device_reset(struct scsi_qla_hos } /* - * qla1280_abort_device - * Issue an abort message to the device - * - * Input: - * ha = adapter block pointer. - * bus = SCSI BUS. - * target = SCSI ID. 
- * lun = SCSI LUN. - * - * Returns: - * 0 = success - */ -static int -qla1280_abort_device(struct scsi_qla_host *ha, int bus, int target, int lun) -{ - uint16_t mb[MAILBOX_REGISTER_COUNT]; - int status; - - ENTER("qla1280_abort_device"); - - mb[0] = MBC_ABORT_DEVICE; - mb[1] = (bus ? target | BIT_7 : target) << 8 | lun; - status = qla1280_mailbox_command(ha, BIT_1 | BIT_0, &mb[0]); - - /* Issue marker command. */ - qla1280_marker(ha, bus, target, lun, MK_SYNC_ID_LUN); - - if (status) - dprintk(2, "qla1280_abort_device: **** FAILED ****\n"); - - LEAVE("qla1280_abort_device"); - return status; -} - -/* * qla1280_abort_command * Abort command aborts a specified IOCB. * @@ -2802,7 +2752,7 @@ qla1280_64bit_start_scsi(struct scsi_qla /* If room for request in request ring. */ if ((req_cnt + 2) >= ha->req_q_cnt) { - status = 1; + status = SCSI_MLQUEUE_HOST_BUSY; dprintk(2, "qla1280_start_scsi: in-ptr=0x%x req_q_cnt=" "0x%xreq_cnt=0x%x", ha->req_ring_index, ha->req_q_cnt, req_cnt); @@ -2814,7 +2764,7 @@ qla1280_64bit_start_scsi(struct scsi_qla ha->outstanding_cmds[cnt] != NULL; cnt++); if (cnt >= MAX_OUTSTANDING_COMMANDS) { - status = 1; + status = SCSI_MLQUEUE_HOST_BUSY; dprintk(2, "qla1280_start_scsi: NO ROOM IN " "OUTSTANDING ARRAY, req_q_cnt=0x%x", ha->req_q_cnt); goto out; @@ -3077,7 +3027,7 @@ qla1280_32bit_start_scsi(struct scsi_qla ha->req_q_cnt, seg_cnt); /* If room for request in request ring. 
*/ if ((req_cnt + 2) >= ha->req_q_cnt) { - status = 1; + status = SCSI_MLQUEUE_HOST_BUSY; dprintk(2, "qla1280_32bit_start_scsi: in-ptr=0x%x, " "req_q_cnt=0x%x, req_cnt=0x%x", ha->req_ring_index, ha->req_q_cnt, req_cnt); @@ -3089,7 +3039,7 @@ qla1280_32bit_start_scsi(struct scsi_qla (ha->outstanding_cmds[cnt] != 0); cnt++) ; if (cnt >= MAX_OUTSTANDING_COMMANDS) { - status = 1; + status = SCSI_MLQUEUE_HOST_BUSY; dprintk(2, "qla1280_32bit_start_scsi: NO ROOM IN OUTSTANDING " "ARRAY, req_q_cnt=0x%x\n", ha->req_q_cnt); goto out; @@ -3456,6 +3406,7 @@ qla1280_isr(struct scsi_qla_host *ha, st /* Save ISP completion status */ CMD_RESULT(sp->cmd) = 0; + CMD_HANDLE(sp->cmd) = COMPLETED_HANDLE; /* Place block on done queue */ list_add_tail(&sp->list, done_q); @@ -3464,7 +3415,7 @@ qla1280_isr(struct scsi_qla_host *ha, st * If we get here we have a real problem! */ printk(KERN_WARNING - "qla1280: ISP invalid handle"); + "qla1280: ISP invalid handle\n"); } } break; @@ -3722,6 +3673,8 @@ qla1280_status_entry(struct scsi_qla_hos } } + CMD_HANDLE(sp->cmd) = COMPLETED_HANDLE; + /* Place command on done queue. */ list_add_tail(&sp->list, done_q); out: @@ -3777,6 +3730,8 @@ qla1280_error_entry(struct scsi_qla_host CMD_RESULT(sp->cmd) = DID_ERROR << 16; } + CMD_HANDLE(sp->cmd) = COMPLETED_HANDLE; + /* Place command on done queue. 
*/ list_add_tail(&sp->list, done_q); } @@ -3827,19 +3782,16 @@ qla1280_abort_isp(struct scsi_qla_host * struct scsi_cmnd *cmd; sp = ha->outstanding_cmds[cnt]; if (sp) { - cmd = sp->cmd; CMD_RESULT(cmd) = DID_RESET << 16; - - sp->cmd = NULL; + CMD_HANDLE(cmd) = COMPLETED_HANDLE; ha->outstanding_cmds[cnt] = NULL; - - (*cmd->scsi_done)(cmd); - - sp->flags = 0; + list_add_tail(&sp->list, &ha->done_q); } } + qla1280_done(ha); + status = qla1280_load_firmware(ha); if (status) goto out; @@ -3924,13 +3876,6 @@ qla1280_check_for_dead_scsi_bus(struct s if (scsi_control == SCSI_PHASE_INVALID) { ha->bus_settings[bus].scsi_bus_dead = 1; -#if 0 - CMD_RESULT(cp) = DID_NO_CONNECT << 16; - CMD_HANDLE(cp) = INVALID_HANDLE; - /* ha->actthreads--; */ - - (*(cp)->scsi_done)(cp); -#endif return 1; /* bus is dead */ } else { ha->bus_settings[bus].scsi_bus_dead = 0;