1 From: Mike Reed <mdr@sgi.com>
2 Subject: panic in qla1280_done following adapter reset
5 Customer has had repeated crashes after a SCSI adapter reset. This looks like
6 the same issue as bug 232908. Unfortunately, the customer's solution to this
7 was to replace the QLogic card with an LSI card (just like the customer did in
10 Mike Reed has submitted a patch upstream to fix this problem. We would like to
11 see this patch in a SLES10 SP2 update (and a SLES11 update) so that other
12 customers with this QLogic card won't hit it.
14 Signed-off-by: Hannes Reinecke <hare@suse.de>
16 --- linux-2.6.27.19-5/drivers/scsi/qla1280.h 2008-10-09 17:13:53.000000000 -0500
17 +++ linux-2.6.27.19-5-modified/drivers/scsi/qla1280.h 2009-03-30 13:19:03.447145988 -0500
20 /* Maximum outstanding commands in ISP queues */
21 #define MAX_OUTSTANDING_COMMANDS 512
22 -#define INVALID_HANDLE (MAX_OUTSTANDING_COMMANDS + 2)
23 +#define COMPLETED_HANDLE ((unsigned char *) \
24 + (MAX_OUTSTANDING_COMMANDS + 2))
26 /* ISP request and response entry counts (37-65535) */
27 #define REQUEST_ENTRY_CNT 255 /* Number of request entries. */
28 --- linux-2.6.27.19-5/drivers/scsi/qla1280.c 2009-02-28 01:19:41.000000000 -0600
29 +++ linux-2.6.27.19-5-modified/drivers/scsi/qla1280.c 2009-04-17 12:46:11.810488760 -0500
31 * General Public License for more details.
33 ******************************************************************************/
34 -#define QLA1280_VERSION "3.26"
35 +#define QLA1280_VERSION "3.27"
36 /*****************************************************************************
38 + Rev 3.27, February 10, 2009, Michael Reed
39 + - General code cleanup.
40 + - Improve error recovery.
41 Rev 3.26, January 16, 2006 Jes Sorensen
42 - Ditch all < 2.6 support
43 Rev 3.25.1, February 10, 2005 Christoph Hellwig
44 @@ -438,7 +441,6 @@ static int qla1280_mailbox_command(struc
46 static int qla1280_bus_reset(struct scsi_qla_host *, int);
47 static int qla1280_device_reset(struct scsi_qla_host *, int, int);
48 -static int qla1280_abort_device(struct scsi_qla_host *, int, int, int);
49 static int qla1280_abort_command(struct scsi_qla_host *, struct srb *, int);
50 static int qla1280_abort_isp(struct scsi_qla_host *);
52 @@ -710,7 +712,7 @@ qla1280_info(struct Scsi_Host *host)
55 /**************************************************************************
56 - * qla1200_queuecommand
57 + * qla1280_queuecommand
58 * Queue a command to the controller.
61 @@ -725,12 +727,14 @@ qla1280_queuecommand(struct scsi_cmnd *c
63 struct Scsi_Host *host = cmd->device->host;
64 struct scsi_qla_host *ha = (struct scsi_qla_host *)host->hostdata;
65 - struct srb *sp = (struct srb *)&cmd->SCp;
66 + struct srb *sp = (struct srb *)CMD_SP(cmd);
73 + CMD_HANDLE(cmd) = (unsigned char *)NULL;
75 qla1280_print_scsi_cmd(5, cmd);
77 @@ -750,21 +754,11 @@ qla1280_queuecommand(struct scsi_cmnd *c
88 -/* timer action for error action processor */
89 -static void qla1280_error_wait_timeout(unsigned long __data)
91 - struct scsi_cmnd *cmd = (struct scsi_cmnd *)__data;
92 - struct srb *sp = (struct srb *)CMD_SP(cmd);
97 static void qla1280_mailbox_timeout(unsigned long __data)
99 @@ -779,8 +773,67 @@ static void qla1280_mailbox_timeout(unsi
100 complete(ha->mailbox_wait);
104 +_qla1280_wait_for_single_command(struct scsi_qla_host *ha, struct srb *sp,
105 + struct completion *wait)
107 + int status = FAILED;
108 + struct scsi_cmnd *cmd = sp->cmd;
110 + spin_unlock_irq(ha->host->host_lock);
111 + wait_for_completion_timeout(wait, 4*HZ);
112 + spin_lock_irq(ha->host->host_lock);
114 + if(CMD_HANDLE(cmd) == COMPLETED_HANDLE) {
116 + (*cmd->scsi_done)(cmd);
122 +qla1280_wait_for_single_command(struct scsi_qla_host *ha, struct srb *sp)
124 + DECLARE_COMPLETION_ONSTACK(wait);
127 + return _qla1280_wait_for_single_command(ha, sp, &wait);
131 +qla1280_wait_for_pending_commands(struct scsi_qla_host *ha, int bus, int target)
136 + struct scsi_cmnd *cmd;
141 + * Wait for all commands with the designated bus/target
142 + * to be completed by the firmware
144 + for (cnt = 0; cnt < MAX_OUTSTANDING_COMMANDS; cnt++) {
145 + sp = ha->outstanding_cmds[cnt];
149 + if (bus >= 0 && SCSI_BUS_32(cmd) != bus)
151 + if (target >= 0 && SCSI_TCN_32(cmd) != target)
154 + status = qla1280_wait_for_single_command(ha, sp);
155 + if (status == FAILED)
162 /**************************************************************************
163 - * qla1200_error_action
164 + * qla1280_error_action
165 * The function will attempt to perform a specified error action and
166 * wait for the results (or time out).
168 @@ -792,11 +845,6 @@ static void qla1280_mailbox_timeout(unsi
173 - * Resetting the bus always succeeds - is has to, otherwise the
174 - * kernel will panic! Try a surgical technique - sending a BUS
175 - * DEVICE RESET message - on the offending target before pulling
176 - * the SCSI bus reset line.
177 **************************************************************************/
179 qla1280_error_action(struct scsi_cmnd *cmd, enum action action)
180 @@ -804,13 +852,19 @@ qla1280_error_action(struct scsi_cmnd *c
181 struct scsi_qla_host *ha;
182 int bus, target, lun;
185 - unsigned char *handle;
189 + int wait_for_bus=-1;
190 + int wait_for_target = -1;
191 DECLARE_COMPLETION_ONSTACK(wait);
192 - struct timer_list timer;
194 + ENTER("qla1280_error_action");
196 ha = (struct scsi_qla_host *)(CMD_HOST(cmd)->hostdata);
197 + sp = (struct srb *)CMD_SP(cmd);
198 + bus = SCSI_BUS_32(cmd);
199 + target = SCSI_TCN_32(cmd);
200 + lun = SCSI_LUN_32(cmd);
202 dprintk(4, "error_action %i, istatus 0x%04x\n", action,
203 RD_REG_WORD(&ha->iobase->istatus));
204 @@ -819,99 +873,47 @@ qla1280_error_action(struct scsi_cmnd *c
205 RD_REG_WORD(&ha->iobase->host_cmd),
206 RD_REG_WORD(&ha->iobase->ictrl), jiffies);
208 - ENTER("qla1280_error_action");
210 printk(KERN_INFO "scsi(%li): Resetting Cmnd=0x%p, "
211 "Handle=0x%p, action=0x%x\n",
212 ha->host_no, cmd, CMD_HANDLE(cmd), action);
215 - printk(KERN_WARNING "(scsi?:?:?:?) Reset called with NULL "
216 - "si_Cmnd pointer, failing.\n");
217 - LEAVE("qla1280_error_action");
221 - ha = (struct scsi_qla_host *)cmd->device->host->hostdata;
222 - sp = (struct srb *)CMD_SP(cmd);
223 - handle = CMD_HANDLE(cmd);
225 - /* Check for pending interrupts. */
226 - data = qla1280_debounce_register(&ha->iobase->istatus);
228 - * The io_request_lock is held when the reset handler is called, hence
229 - * the interrupt handler cannot be running in parallel as it also
230 - * grabs the lock. /Jes
232 - if (data & RISC_INT)
233 - qla1280_isr(ha, &ha->done_q);
236 - * Determine the suggested action that the mid-level driver wants
238 + * Check to see if we have the command in the outstanding_cmds[]
239 + * array. If not then it must have completed before this error
240 + * action was initiated. If the error_action isn't ABORT_COMMAND
241 + * then the driver must proceed with the requested action.
243 - if (handle == (unsigned char *)INVALID_HANDLE || handle == NULL) {
244 - if(action == ABORT_COMMAND) {
245 - /* we never got this command */
246 - printk(KERN_INFO "qla1280: Aborting a NULL handle\n");
247 - return SUCCESS; /* no action - we don't have command */
249 + for (i = 0; i < MAX_OUTSTANDING_COMMANDS; i++) {
250 + if (sp == ha->outstanding_cmds[i]) {
252 + sp->wait = &wait; /* we'll wait for it to complete */
259 - bus = SCSI_BUS_32(cmd);
260 - target = SCSI_TCN_32(cmd);
261 - lun = SCSI_LUN_32(cmd);
262 + if (found < 0) { /* driver doesn't have command */
264 + if (qla1280_verbose) {
266 + "scsi(%ld:%d:%d:%d): specified command has "
267 + "already completed.\n", ha->host_no, bus,
272 - /* Overloading result. Here it means the success or fail of the
273 - * *issue* of the action. When we return from the routine, it must
274 - * mean the actual success or fail of the action */
281 - if ((sp->flags & SRB_ABORT_PENDING)) {
282 - printk(KERN_WARNING
283 - "scsi(): Command has a pending abort "
284 - "message - ABORT_PENDING.\n");
285 - /* This should technically be impossible since we
286 - * now wait for abort completion */
290 - for (i = 0; i < MAX_OUTSTANDING_COMMANDS; i++) {
291 - if (sp == ha->outstanding_cmds[i]) {
292 - dprintk(1, "qla1280: RISC aborting command\n");
293 - if (qla1280_abort_command(ha, sp, i) == 0)
297 - * Since we don't know what might
298 - * have happend to the command, it
299 - * is unsafe to remove it from the
300 - * device's queue at this point.
301 - * Wait and let the escalation
302 - * process take care of it.
304 - printk(KERN_WARNING
305 - "scsi(%li:%i:%i:%i): Unable"
306 - " to abort command!\n",
307 - ha->host_no, bus, target, lun);
314 - if (qla1280_verbose)
316 - "scsi(%ld:%d:%d:%d): Queueing abort device "
317 - "command.\n", ha->host_no, bus, target, lun);
318 - if (qla1280_abort_device(ha, bus, target, lun) == 0)
320 + dprintk(1, "qla1280: RISC aborting command\n");
322 + * The abort might fail due to race when the host_lock
323 + * is released to issue the abort. As such, we
324 + * don't bother to check the return status.
327 + qla1280_abort_command(ha, sp, found);
331 @@ -919,16 +921,21 @@ qla1280_error_action(struct scsi_cmnd *c
333 "scsi(%ld:%d:%d:%d): Queueing device reset "
334 "command.\n", ha->host_no, bus, target, lun);
335 - if (qla1280_device_reset(ha, bus, target) == 0)
337 + if (qla1280_device_reset(ha, bus, target) == 0) {
338 + /* issued device reset, set wait conditions */
339 + wait_for_bus = bus;
340 + wait_for_target = target;
346 printk(KERN_INFO "qla1280(%ld:%d): Issued bus "
347 "reset.\n", ha->host_no, bus);
348 - if (qla1280_bus_reset(ha, bus) == 0)
350 + if (qla1280_bus_reset(ha, bus) == 0) {
351 + /* issued bus reset, set wait conditions */
352 + wait_for_bus = bus;
357 @@ -941,55 +948,48 @@ qla1280_error_action(struct scsi_cmnd *c
358 "continue automatically\n", ha->host_no);
360 ha->flags.reset_active = 1;
362 - * We restarted all of the commands automatically, so the
363 - * mid-level code can expect completions momentitarily.
365 - if (qla1280_abort_isp(ha) == 0)
368 + if (qla1280_abort_isp(ha) != 0) { /* it's dead */
372 ha->flags.reset_active = 0;
375 - if (!list_empty(&ha->done_q))
378 + * At this point, the host_lock has been released and retaken
379 + * by the issuance of the mailbox command.
380 + * Wait for the command passed in by the mid-layer if it
381 + * was found by the driver. It might have been returned
382 + * between eh recovery steps, hence the check of the "found"
386 - /* If we didn't manage to issue the action, or we have no
387 - * command to wait for, exit here */
388 - if (result == FAILED || handle == NULL ||
389 - handle == (unsigned char *)INVALID_HANDLE) {
391 - * Clear completion queue to avoid qla1280_done() trying
392 - * to complete the command at a later stage after we
393 - * have exited the current context
399 + result = _qla1280_wait_for_single_command(ha, sp, &wait);
401 - /* set up a timer just in case we're really jammed */
402 - init_timer(&timer);
403 - timer.expires = jiffies + 4*HZ;
404 - timer.data = (unsigned long)cmd;
405 - timer.function = qla1280_error_wait_timeout;
408 - /* wait for the action to complete (or the timer to expire) */
409 - spin_unlock_irq(ha->host->host_lock);
410 - wait_for_completion(&wait);
411 - del_timer_sync(&timer);
412 - spin_lock_irq(ha->host->host_lock);
414 + if (action == ABORT_COMMAND && result != SUCCESS) {
415 + printk(KERN_WARNING
416 + "scsi(%li:%i:%i:%i): "
417 + "Unable to abort command!\n",
418 + ha->host_no, bus, target, lun);
421 - /* the only action we might get a fail for is abort */
422 - if (action == ABORT_COMMAND) {
423 - if(sp->flags & SRB_ABORTED)
428 + * If the command passed in by the mid-layer has been
429 + * returned by the board, then wait for any additional
430 + * commands which are supposed to complete based upon
431 + * the error action.
433 + * All commands are unconditionally returned during a
434 + * call to qla1280_abort_isp(), ADAPTER_RESET. No need
435 + * to wait for them.
437 + if (result == SUCCESS && wait_for_bus >= 0) {
438 + result = qla1280_wait_for_pending_commands(ha,
439 + wait_for_bus, wait_for_target);
443 dprintk(1, "RESET returning %d\n", result);
445 LEAVE("qla1280_error_action");
446 @@ -1292,13 +1292,12 @@ qla1280_done(struct scsi_qla_host *ha)
447 switch ((CMD_RESULT(cmd) >> 16)) {
449 /* Issue marker command. */
450 - qla1280_marker(ha, bus, target, 0, MK_SYNC_ID);
451 + if (!ha->flags.abort_isp_active)
452 + qla1280_marker(ha, bus, target, 0, MK_SYNC_ID);
455 sp->flags &= ~SRB_ABORT_PENDING;
456 sp->flags |= SRB_ABORTED;
457 - if (sp->flags & SRB_TIMEOUT)
458 - CMD_RESULT(sp->cmd) = DID_TIME_OUT << 16;
462 @@ -1308,12 +1307,11 @@ qla1280_done(struct scsi_qla_host *ha)
465 /* Call the mid-level driver interrupt handler */
466 - CMD_HANDLE(sp->cmd) = (unsigned char *)INVALID_HANDLE;
469 - (*(cmd)->scsi_done)(cmd);
471 - if(sp->wait != NULL)
472 + if (sp->wait == NULL)
473 + (*(cmd)->scsi_done)(cmd);
477 LEAVE("qla1280_done");
478 @@ -2386,9 +2384,6 @@ static int
479 qla1280_mailbox_command(struct scsi_qla_host *ha, uint8_t mr, uint16_t *mb)
481 struct device_reg __iomem *reg = ha->iobase;
487 uint16_t *optr, *iptr;
488 @@ -2462,19 +2457,9 @@ qla1280_mailbox_command(struct scsi_qla_
489 mr = MAILBOX_REGISTER_COUNT;
490 memcpy(optr, iptr, MAILBOX_REGISTER_COUNT * sizeof(uint16_t));
493 - /* Go check for any response interrupts pending. */
494 - qla1280_isr(ha, &done_q);
497 if (ha->flags.reset_marker)
501 - if (!list_empty(&done_q))
502 - qla1280_done(ha, &done_q);
506 dprintk(2, "qla1280_mailbox_command: **** FAILED, mailbox0 = "
507 "0x%x ****\n", mb[0]);
508 @@ -2610,41 +2595,6 @@ qla1280_device_reset(struct scsi_qla_hos
512 - * qla1280_abort_device
513 - * Issue an abort message to the device
516 - * ha = adapter block pointer.
518 - * target = SCSI ID.
525 -qla1280_abort_device(struct scsi_qla_host *ha, int bus, int target, int lun)
527 - uint16_t mb[MAILBOX_REGISTER_COUNT];
530 - ENTER("qla1280_abort_device");
532 - mb[0] = MBC_ABORT_DEVICE;
533 - mb[1] = (bus ? target | BIT_7 : target) << 8 | lun;
534 - status = qla1280_mailbox_command(ha, BIT_1 | BIT_0, &mb[0]);
536 - /* Issue marker command. */
537 - qla1280_marker(ha, bus, target, lun, MK_SYNC_ID_LUN);
540 - dprintk(2, "qla1280_abort_device: **** FAILED ****\n");
542 - LEAVE("qla1280_abort_device");
547 * qla1280_abort_command
548 * Abort command aborts a specified IOCB.
550 @@ -2802,7 +2752,7 @@ qla1280_64bit_start_scsi(struct scsi_qla
552 /* If room for request in request ring. */
553 if ((req_cnt + 2) >= ha->req_q_cnt) {
555 + status = SCSI_MLQUEUE_HOST_BUSY;
556 dprintk(2, "qla1280_start_scsi: in-ptr=0x%x req_q_cnt="
557 "0x%xreq_cnt=0x%x", ha->req_ring_index, ha->req_q_cnt,
559 @@ -2814,7 +2764,7 @@ qla1280_64bit_start_scsi(struct scsi_qla
560 ha->outstanding_cmds[cnt] != NULL; cnt++);
562 if (cnt >= MAX_OUTSTANDING_COMMANDS) {
564 + status = SCSI_MLQUEUE_HOST_BUSY;
565 dprintk(2, "qla1280_start_scsi: NO ROOM IN "
566 "OUTSTANDING ARRAY, req_q_cnt=0x%x", ha->req_q_cnt);
568 @@ -3077,7 +3027,7 @@ qla1280_32bit_start_scsi(struct scsi_qla
569 ha->req_q_cnt, seg_cnt);
570 /* If room for request in request ring. */
571 if ((req_cnt + 2) >= ha->req_q_cnt) {
573 + status = SCSI_MLQUEUE_HOST_BUSY;
574 dprintk(2, "qla1280_32bit_start_scsi: in-ptr=0x%x, "
575 "req_q_cnt=0x%x, req_cnt=0x%x", ha->req_ring_index,
576 ha->req_q_cnt, req_cnt);
577 @@ -3089,7 +3039,7 @@ qla1280_32bit_start_scsi(struct scsi_qla
578 (ha->outstanding_cmds[cnt] != 0); cnt++) ;
580 if (cnt >= MAX_OUTSTANDING_COMMANDS) {
582 + status = SCSI_MLQUEUE_HOST_BUSY;
583 dprintk(2, "qla1280_32bit_start_scsi: NO ROOM IN OUTSTANDING "
584 "ARRAY, req_q_cnt=0x%x\n", ha->req_q_cnt);
586 @@ -3456,6 +3406,7 @@ qla1280_isr(struct scsi_qla_host *ha, st
588 /* Save ISP completion status */
589 CMD_RESULT(sp->cmd) = 0;
590 + CMD_HANDLE(sp->cmd) = COMPLETED_HANDLE;
592 /* Place block on done queue */
593 list_add_tail(&sp->list, done_q);
594 @@ -3464,7 +3415,7 @@ qla1280_isr(struct scsi_qla_host *ha, st
595 * If we get here we have a real problem!
598 - "qla1280: ISP invalid handle");
599 + "qla1280: ISP invalid handle\n");
603 @@ -3722,6 +3673,8 @@ qla1280_status_entry(struct scsi_qla_hos
607 + CMD_HANDLE(sp->cmd) = COMPLETED_HANDLE;
609 /* Place command on done queue. */
610 list_add_tail(&sp->list, done_q);
612 @@ -3777,6 +3730,8 @@ qla1280_error_entry(struct scsi_qla_host
613 CMD_RESULT(sp->cmd) = DID_ERROR << 16;
616 + CMD_HANDLE(sp->cmd) = COMPLETED_HANDLE;
618 /* Place command on done queue. */
619 list_add_tail(&sp->list, done_q);
621 @@ -3827,19 +3782,16 @@ qla1280_abort_isp(struct scsi_qla_host *
622 struct scsi_cmnd *cmd;
623 sp = ha->outstanding_cmds[cnt];
627 CMD_RESULT(cmd) = DID_RESET << 16;
630 + CMD_HANDLE(cmd) = COMPLETED_HANDLE;
631 ha->outstanding_cmds[cnt] = NULL;
633 - (*cmd->scsi_done)(cmd);
636 + list_add_tail(&sp->list, &ha->done_q);
642 status = qla1280_load_firmware(ha);
645 @@ -3924,13 +3876,6 @@ qla1280_check_for_dead_scsi_bus(struct s
647 if (scsi_control == SCSI_PHASE_INVALID) {
648 ha->bus_settings[bus].scsi_bus_dead = 1;
650 - CMD_RESULT(cp) = DID_NO_CONNECT << 16;
651 - CMD_HANDLE(cp) = INVALID_HANDLE;
652 - /* ha->actthreads--; */
654 - (*(cp)->scsi_done)(cp);
656 return 1; /* bus is dead */
658 ha->bus_settings[bus].scsi_bus_dead = 0;