[SCSI] cciss: scsi error handling
This patch adds SCSI error handling code to the SCSI portion of the cciss driver. Signed-off-by: Stephen M. Cameron <steve.cameron@hp.com> Acked-by: Mike Miller <mike.miller@hp.com> Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
This commit is contained in:
parent
13bf50d1f2
commit
3da8b713da
|
@ -133,3 +133,32 @@ hardware and it is important to prevent the kernel from attempting to directly
|
|||
access these devices too, as if the array controller were merely a SCSI
|
||||
controller in the same way that we are allowing it to access SCSI tape drives.
|
||||
|
||||
SCSI error handling for tape drives and medium changers
|
||||
-------------------------------------------------------
|
||||
|
||||
The linux SCSI mid layer provides an error handling protocol which
|
||||
kicks into gear whenever a SCSI command fails to complete within a
|
||||
certain amount of time (which can vary depending on the command).
|
||||
The cciss driver participates in this protocol to some extent. The
|
||||
normal protocol is a four step process. First the device is told
|
||||
to abort the command. If that doesn't work, the device is reset.
|
||||
If that doesn't work, the SCSI bus is reset. If that doesn't work
|
||||
the host bus adapter is reset. Because the cciss driver is a block
|
||||
driver as well as a SCSI driver and only the tape drives and medium
|
||||
changers are presented to the SCSI mid layer, and unlike more
|
||||
straightforward SCSI drivers, disk i/o continues through the block
|
||||
side during the SCSI error recovery process, the cciss driver only
|
||||
implements the first two of these actions, aborting the command, and
|
||||
resetting the device. Additionally, most tape drives will not oblige
|
||||
in aborting commands, and sometimes it appears they will not even
|
||||
obey a reset command, though in most circumstances they will. In
|
||||
the case that the command cannot be aborted and the device cannot be
|
||||
reset, the device will be set offline.
|
||||
|
||||
In the event the error handling code is triggered and a tape drive is
|
||||
successfully reset or the tardy command is successfully aborted, the
|
||||
tape drive may still not allow i/o to continue until some command
|
||||
is issued which positions the tape to a known position. Typically you
|
||||
must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example)
|
||||
before i/o can proceed again to a tape drive which was reset.
|
||||
|
||||
|
|
|
@ -148,6 +148,7 @@ static struct board_type products[] = {
|
|||
static ctlr_info_t *hba[MAX_CTLR];
|
||||
|
||||
static void do_cciss_request(request_queue_t *q);
|
||||
static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs);
|
||||
static int cciss_open(struct inode *inode, struct file *filep);
|
||||
static int cciss_release(struct inode *inode, struct file *filep);
|
||||
static int cciss_ioctl(struct inode *inode, struct file *filep,
|
||||
|
@ -1586,6 +1587,24 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
|
|||
}
|
||||
} else if (cmd_type == TYPE_MSG) {
|
||||
switch (cmd) {
|
||||
case 0: /* ABORT message */
|
||||
c->Request.CDBLen = 12;
|
||||
c->Request.Type.Attribute = ATTR_SIMPLE;
|
||||
c->Request.Type.Direction = XFER_WRITE;
|
||||
c->Request.Timeout = 0;
|
||||
c->Request.CDB[0] = cmd; /* abort */
|
||||
c->Request.CDB[1] = 0; /* abort a command */
|
||||
/* buff contains the tag of the command to abort */
|
||||
memcpy(&c->Request.CDB[4], buff, 8);
|
||||
break;
|
||||
case 1: /* RESET message */
|
||||
c->Request.CDBLen = 12;
|
||||
c->Request.Type.Attribute = ATTR_SIMPLE;
|
||||
c->Request.Type.Direction = XFER_WRITE;
|
||||
c->Request.Timeout = 0;
|
||||
memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
|
||||
c->Request.CDB[0] = cmd; /* reset */
|
||||
c->Request.CDB[1] = 0x04; /* reset a LUN */
|
||||
case 3: /* No-Op message */
|
||||
c->Request.CDBLen = 1;
|
||||
c->Request.Type.Attribute = ATTR_SIMPLE;
|
||||
|
@ -1872,6 +1891,52 @@ static unsigned long pollcomplete(int ctlr)
|
|||
/* Invalid address to tell caller we ran out of time */
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Deal with a completion that sendcmd() polled off the controller but
 * which is not the command sendcmd() itself just submitted.
 *
 * Normally that never happens.  It can happen when the SCSI tape error
 * handling uses sendcmd() to abort a command or reset a device while
 * block i/o or cciss ioctls are still in flight: sendcmd() may then
 * pick up their completions.  Those are stashed in the controller's
 * scsi_rejects list so they can be processed later by the interrupt
 * handler.
 *
 * cmd      - the message sendcmd() is currently sending
 * ctlr     - controller index into hba[]
 * complete - the unexpected completion value returned by the controller
 *
 * Returns 0 when the caller may carry on polling, 1 when the completion
 * could not be saved.
 *
 * NOTE(review): without CONFIG_CISS_SCSI_TAPE the warning is printed and
 * 0 is still returned; this mirrors the original code -- confirm it is
 * intended.
 */
static int add_sendcmd_reject(__u8 cmd, int ctlr, unsigned long complete)
{
#ifdef CONFIG_CISS_SCSI_TAPE
	struct sendcmd_reject_list *rejects = &hba[ctlr]->scsi_rejects;
	int sent_by_scsi_eh = (cmd == CCISS_RESET_MSG ||
			       cmd == CCISS_ABORT_MSG);

	/* Only the tape error handling (abort or reset) is expected to */
	/* see foreign completions; anything else is simply an error.   */
	if (!sent_by_scsi_eh) {
		printk( KERN_WARNING "cciss cciss%d: SendCmd "
			"Invalid command list address returned! (%lx)\n",
			ctlr, complete);
		return 1;
	}

	/* An abort or reset is outstanding and something else completed. */
	if (rejects->ncompletions >= (NR_CMDS + 2)) {
		/* The reject list is full, the completion cannot be kept. */
		printk(KERN_WARNING "cciss%d: Sendcmd: Invalid command addr, "
			"reject list overflow, command lost!\n", ctlr);
		return 1;
	}

	/* Remember it so it can be processed later. */
	rejects->complete[rejects->ncompletions] = complete;
	rejects->ncompletions++;
	return 0;
#else
	printk( KERN_WARNING "cciss cciss%d: SendCmd "
		"Invalid command list address returned! (%lx)\n",
		ctlr, complete);
	/* not much we can do. */
	return 0;
#endif
}
|
||||
|
||||
/*
|
||||
* Send a command to the controller, and wait for it to complete.
|
||||
* Only used at init time.
|
||||
|
@ -1894,7 +1959,7 @@ static int sendcmd(
|
|||
unsigned long complete;
|
||||
ctlr_info_t *info_p= hba[ctlr];
|
||||
u64bit buff_dma_handle;
|
||||
int status;
|
||||
int status, done = 0;
|
||||
|
||||
if ((c = cmd_alloc(info_p, 1)) == NULL) {
|
||||
printk(KERN_WARNING "cciss: unable to get memory");
|
||||
|
@ -1916,7 +1981,9 @@ resend_cmd1:
|
|||
info_p->access.set_intr_mask(info_p, CCISS_INTR_OFF);
|
||||
|
||||
/* Make sure there is room in the command FIFO */
|
||||
/* Actually it should be completely empty at this time. */
|
||||
/* Actually it should be completely empty at this time */
|
||||
/* unless we are in here doing error handling for the scsi */
|
||||
/* tape side of the driver. */
|
||||
for (i = 200000; i > 0; i--)
|
||||
{
|
||||
/* if fifo isn't full go */
|
||||
|
@ -1933,13 +2000,25 @@ resend_cmd1:
|
|||
* Send the cmd
|
||||
*/
|
||||
info_p->access.submit_command(info_p, c);
|
||||
done = 0;
|
||||
do {
|
||||
complete = pollcomplete(ctlr);
|
||||
|
||||
#ifdef CCISS_DEBUG
|
||||
printk(KERN_DEBUG "cciss: command completed\n");
|
||||
#endif /* CCISS_DEBUG */
|
||||
|
||||
if (complete != 1) {
|
||||
if (complete == 1) {
|
||||
printk( KERN_WARNING
|
||||
"cciss cciss%d: SendCmd Timeout out, "
|
||||
"No command list address returned!\n",
|
||||
ctlr);
|
||||
status = IO_ERROR;
|
||||
done = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
/* This will need to change for direct lookup completions */
|
||||
if ( (complete & CISS_ERROR_BIT)
|
||||
&& (complete & ~CISS_ERROR_BIT) == c->busaddr)
|
||||
{
|
||||
|
@ -1979,6 +2058,10 @@ resend_cmd1:
|
|||
status = IO_ERROR;
|
||||
goto cleanup1;
|
||||
}
|
||||
} else if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
|
||||
printk(KERN_WARNING "cciss%d: command could not be aborted.\n", ctlr);
|
||||
status = IO_ERROR;
|
||||
goto cleanup1;
|
||||
}
|
||||
printk(KERN_WARNING "ciss ciss%d: sendcmd"
|
||||
" Error %x \n", ctlr,
|
||||
|
@ -1993,20 +2076,15 @@ resend_cmd1:
|
|||
goto cleanup1;
|
||||
}
|
||||
}
|
||||
/* This will need changing for direct lookup completions */
|
||||
if (complete != c->busaddr) {
|
||||
printk( KERN_WARNING "cciss cciss%d: SendCmd "
|
||||
"Invalid command list address returned! (%lx)\n",
|
||||
ctlr, complete);
|
||||
status = IO_ERROR;
|
||||
goto cleanup1;
|
||||
}
|
||||
} else {
|
||||
printk( KERN_WARNING
|
||||
"cciss cciss%d: SendCmd Timeout out, "
|
||||
"No command list address returned!\n",
|
||||
ctlr);
|
||||
status = IO_ERROR;
|
||||
if (add_sendcmd_reject(cmd, ctlr, complete) != 0) {
|
||||
BUG(); /* we are pretty much hosed if we get here. */
|
||||
}
|
||||
continue;
|
||||
} else
|
||||
done = 1;
|
||||
} while (!done);
|
||||
|
||||
cleanup1:
|
||||
/* unlock the data buffer from DMA */
|
||||
|
@ -2014,6 +2092,11 @@ cleanup1:
|
|||
buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
|
||||
pci_unmap_single(info_p->pdev, (dma_addr_t) buff_dma_handle.val,
|
||||
c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
|
||||
#ifdef CONFIG_CISS_SCSI_TAPE
|
||||
/* if we saved some commands for later, process them now. */
|
||||
if (info_p->scsi_rejects.ncompletions > 0)
|
||||
do_cciss_intr(0, info_p, NULL);
|
||||
#endif
|
||||
cmd_free(info_p, c, 1);
|
||||
return (status);
|
||||
}
|
||||
|
@ -2338,6 +2421,48 @@ startio:
|
|||
start_io(h);
|
||||
}
|
||||
|
||||
/*
 * Fetch the next completion to be processed for controller h.
 *
 * With CONFIG_CISS_SCSI_TAPE, completions that sendcmd() set aside in
 * h->scsi_rejects are handed out first, most recently saved entry
 * first; once that list is empty (or without tape support) the value
 * comes straight from the controller via h->access.command_completed().
 */
static inline unsigned long get_next_completion(ctlr_info_t *h)
{
#ifdef CONFIG_CISS_SCSI_TAPE
	struct sendcmd_reject_list *saved = &h->scsi_rejects;

	if (saved->ncompletions != 0) {
		/* Pop the last saved reject off the list. */
		saved->ncompletions--;
		/* NOTE(review): bare "p" with no log level looks like a */
		/* leftover debug marker -- confirm before removing.     */
		printk("p");
		return saved->complete[saved->ncompletions];
	}
#endif
	return h->access.command_completed(h);
}
|
||||
|
||||
/*
 * Report whether there is completion work to do for controller h:
 * either the board reports a pending interrupt or, with
 * CONFIG_CISS_SCSI_TAPE, sendcmd() has left saved completions in
 * h->scsi_rejects.
 */
static inline int interrupt_pending(ctlr_info_t *h)
{
#ifdef CONFIG_CISS_SCSI_TAPE
	if (h->access.intr_pending(h))
		return 1;
	/* Nothing from the board; saved rejects still count as work. */
	return h->scsi_rejects.ncompletions > 0;
#else
	return h->access.intr_pending(h);
#endif
}
|
||||
|
||||
/*
 * Decide whether an interrupt can be ignored by this controller.
 *
 * Returns non-zero when the board has no interrupt pending or the
 * driver has interrupts disabled -- and, with CONFIG_CISS_SCSI_TAPE,
 * there are also no completions saved in h->scsi_rejects waiting to
 * be processed.
 */
static inline long interrupt_not_for_us(ctlr_info_t *h)
{
	int no_hw_work = (h->access.intr_pending(h) == 0) ||
			 (h->interrupts_enabled == 0);

#ifdef CONFIG_CISS_SCSI_TAPE
	/* Saved rejects must be processed even if the board is quiet. */
	if (h->scsi_rejects.ncompletions != 0)
		return 0;
#endif
	return no_hw_work;
}
|
||||
|
||||
static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs)
|
||||
{
|
||||
ctlr_info_t *h = dev_id;
|
||||
|
@ -2347,19 +2472,15 @@ static irqreturn_t do_cciss_intr(int irq, void *dev_id, struct pt_regs *regs)
|
|||
int j;
|
||||
int start_queue = h->next_to_run;
|
||||
|
||||
/* Is this interrupt for us? */
|
||||
if (( h->access.intr_pending(h) == 0) || (h->interrupts_enabled == 0))
|
||||
if (interrupt_not_for_us(h))
|
||||
return IRQ_NONE;
|
||||
|
||||
/*
|
||||
* If there are completed commands in the completion queue,
|
||||
* we had better do something about it.
|
||||
*/
|
||||
spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
|
||||
while( h->access.intr_pending(h))
|
||||
{
|
||||
while((a = h->access.command_completed(h)) != FIFO_EMPTY)
|
||||
{
|
||||
while (interrupt_pending(h)) {
|
||||
while((a = get_next_completion(h)) != FIFO_EMPTY) {
|
||||
a1 = a;
|
||||
if ((a & 0x04)) {
|
||||
a2 = (a >> 3);
|
||||
|
@ -2966,7 +3087,15 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
|
|||
printk( KERN_ERR "cciss: out of memory");
|
||||
goto clean4;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CISS_SCSI_TAPE
|
||||
hba[i]->scsi_rejects.complete =
|
||||
kmalloc(sizeof(hba[i]->scsi_rejects.complete[0]) *
|
||||
(NR_CMDS + 5), GFP_KERNEL);
|
||||
if (hba[i]->scsi_rejects.complete == NULL) {
|
||||
printk( KERN_ERR "cciss: out of memory");
|
||||
goto clean4;
|
||||
}
|
||||
#endif
|
||||
spin_lock_init(&hba[i]->lock);
|
||||
|
||||
/* Initialize the pdev driver private data.
|
||||
|
@ -3034,6 +3163,10 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
|
|||
return(1);
|
||||
|
||||
clean4:
|
||||
#ifdef CONFIG_CISS_SCSI_TAPE
|
||||
if(hba[i]->scsi_rejects.complete)
|
||||
kfree(hba[i]->scsi_rejects.complete);
|
||||
#endif
|
||||
if(hba[i]->cmd_pool_bits)
|
||||
kfree(hba[i]->cmd_pool_bits);
|
||||
if(hba[i]->cmd_pool)
|
||||
|
@ -3107,6 +3240,9 @@ static void __devexit cciss_remove_one (struct pci_dev *pdev)
|
|||
pci_free_consistent(hba[i]->pdev, NR_CMDS * sizeof( ErrorInfo_struct),
|
||||
hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
|
||||
kfree(hba[i]->cmd_pool_bits);
|
||||
#ifdef CONFIG_CISS_SCSI_TAPE
|
||||
kfree(hba[i]->scsi_rejects.complete);
|
||||
#endif
|
||||
release_io_mem(hba[i]);
|
||||
free_hba(i);
|
||||
}
|
||||
|
|
|
@ -44,6 +44,14 @@ typedef struct _drive_info_struct
|
|||
*/
|
||||
} drive_info_struct;
|
||||
|
||||
#ifdef CONFIG_CISS_SCSI_TAPE
|
||||
|
||||
/*
 * Completions picked up by sendcmd() that did not belong to the command
 * it was polling for; add_sendcmd_reject() appends them here and
 * get_next_completion() hands them back out for normal processing.
 */
struct sendcmd_reject_list {
|
||||
int ncompletions; /* number of entries currently saved in complete[] */
|
||||
unsigned long *complete; /* saved completion values; cciss_init_one() allocates NR_CMDS + 5 entries and add_sendcmd_reject() stops saving at NR_CMDS + 2 */
|
||||
};
|
||||
|
||||
#endif
|
||||
struct ctlr_info
|
||||
{
|
||||
int ctlr;
|
||||
|
@ -100,6 +108,9 @@ struct ctlr_info
|
|||
struct gendisk *gendisk[NWD];
|
||||
#ifdef CONFIG_CISS_SCSI_TAPE
|
||||
void *scsi_ctlr; /* ptr to structure containing scsi related stuff */
|
||||
/* list of block side commands the scsi error handling sucked up */
|
||||
/* and saved for later processing */
|
||||
struct sendcmd_reject_list scsi_rejects;
|
||||
#endif
|
||||
unsigned char alive;
|
||||
};
|
||||
|
|
|
@ -42,6 +42,9 @@
|
|||
|
||||
#include "cciss_scsi.h"
|
||||
|
||||
#define CCISS_ABORT_MSG 0x00
|
||||
#define CCISS_RESET_MSG 0x01
|
||||
|
||||
/* some prototypes... */
|
||||
static int sendcmd(
|
||||
__u8 cmd,
|
||||
|
@ -67,6 +70,8 @@ static int cciss_scsi_proc_info(
|
|||
|
||||
static int cciss_scsi_queue_command (struct scsi_cmnd *cmd,
|
||||
void (* done)(struct scsi_cmnd *));
|
||||
static int cciss_eh_device_reset_handler(struct scsi_cmnd *);
|
||||
static int cciss_eh_abort_handler(struct scsi_cmnd *);
|
||||
|
||||
static struct cciss_scsi_hba_t ccissscsi[MAX_CTLR] = {
|
||||
{ .name = "cciss0", .ndevices = 0 },
|
||||
|
@ -90,6 +95,9 @@ static struct scsi_host_template cciss_driver_template = {
|
|||
.sg_tablesize = MAXSGENTRIES,
|
||||
.cmd_per_lun = 1,
|
||||
.use_clustering = DISABLE_CLUSTERING,
|
||||
/* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */
|
||||
.eh_device_reset_handler= cciss_eh_device_reset_handler,
|
||||
.eh_abort_handler = cciss_eh_abort_handler,
|
||||
};
|
||||
|
||||
#pragma pack(1)
|
||||
|
@ -247,7 +255,7 @@ scsi_cmd_stack_free(int ctlr)
|
|||
#define DEVICETYPE(n) (n<0 || n>MAX_SCSI_DEVICE_CODE) ? \
|
||||
"Unknown" : scsi_device_types[n]
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
static int xmargin=8;
|
||||
static int amargin=60;
|
||||
|
||||
|
@ -1448,6 +1456,78 @@ cciss_proc_tape_report(int ctlr, unsigned char *buffer, off_t *pos, off_t *len)
|
|||
*pos += size; *len += size;
|
||||
}
|
||||
|
||||
/* Need at least one of these error handlers to keep ../scsi/hosts.c from
|
||||
* complaining. Doing a host- or bus-reset can't do anything good here.
|
||||
* Despite what it might say in scsi_error.c, there may well be commands
|
||||
* on the controller, as the cciss driver registers twice, once as a block
|
||||
* device for the logical drives, and once as a scsi device, for any tape
|
||||
* drives. So we know there are no commands out on the tape drives, but we
|
||||
* don't know there are no commands on the controller, and it is likely
|
||||
* that there probably are, as the cciss block device is most commonly used
|
||||
* as a boot device (embedded controller on HP/Compaq systems.)
|
||||
*/
|
||||
|
||||
static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
|
||||
{
|
||||
int rc;
|
||||
CommandList_struct *cmd_in_trouble;
|
||||
ctlr_info_t **c;
|
||||
int ctlr;
|
||||
|
||||
/* find the controller to which the command to be aborted was sent */
|
||||
c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0];
|
||||
if (c == NULL) /* paranoia */
|
||||
return FAILED;
|
||||
ctlr = (*c)->ctlr;
|
||||
printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr);
|
||||
|
||||
/* find the command that's giving us trouble */
|
||||
cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble;
|
||||
if (cmd_in_trouble == NULL) { /* paranoia */
|
||||
return FAILED;
|
||||
}
|
||||
/* send a reset to the SCSI LUN which the command was sent to */
|
||||
rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0,
|
||||
(unsigned char *) &cmd_in_trouble->Header.LUN.LunAddrBytes[0],
|
||||
TYPE_MSG);
|
||||
/* sendcmd turned off interrputs on the board, turn 'em back on. */
|
||||
(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
|
||||
if (rc == 0)
|
||||
return SUCCESS;
|
||||
printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
|
||||
return FAILED;
|
||||
}
|
||||
|
||||
static int cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
|
||||
{
|
||||
int rc;
|
||||
CommandList_struct *cmd_to_abort;
|
||||
ctlr_info_t **c;
|
||||
int ctlr;
|
||||
|
||||
/* find the controller to which the command to be aborted was sent */
|
||||
c = (ctlr_info_t **) &scsicmd->device->host->hostdata[0];
|
||||
if (c == NULL) /* paranoia */
|
||||
return FAILED;
|
||||
ctlr = (*c)->ctlr;
|
||||
printk(KERN_WARNING "cciss%d: aborting tardy SCSI cmd\n", ctlr);
|
||||
|
||||
/* find the command to be aborted */
|
||||
cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble;
|
||||
if (cmd_to_abort == NULL) /* paranoia */
|
||||
return FAILED;
|
||||
rc = sendcmd(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag,
|
||||
0, 2, 0, 0,
|
||||
(unsigned char *) &cmd_to_abort->Header.LUN.LunAddrBytes[0],
|
||||
TYPE_MSG);
|
||||
/* sendcmd turned off interrputs on the board, turn 'em back on. */
|
||||
(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
|
||||
if (rc == 0)
|
||||
return SUCCESS;
|
||||
return FAILED;
|
||||
|
||||
}
|
||||
|
||||
#else /* no CONFIG_CISS_SCSI_TAPE */
|
||||
|
||||
/* If no tape support, then these become defined out of existence */
|
||||
|
|
Loading…
Reference in New Issue