[SCSI] zfcp: minor erp bug fixes

Bug fixes for zfcp's erp:
- trigger adapter reopen if do_QDIO fails
- avoid erp deadlock if registration of scsi target or remote port hang
- do not treat as error if exchange port data fails
- decrease timeout for target reset and aborts
- mark unit failed if slave_destroy is called

Additionally some code cleanup was done:
- made some functions void when retval is not of interest
- shortened initialization of zfcp's host_template
- corrected some comments

Signed-off-by: Andreas Herrmann <aherrman@de.ibm.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
This commit is contained in:
Andreas Herrmann 2006-08-02 11:05:52 +02:00 committed by James Bottomley
parent fea9d6c7bc
commit f6c0e7a7b3
4 changed files with 106 additions and 166 deletions

View File

@ -80,7 +80,7 @@ zfcp_address_to_sg(void *address, struct scatterlist *list)
#define REQUEST_LIST_SIZE 128
/********************* SCSI SPECIFIC DEFINES *********************************/
#define ZFCP_SCSI_ER_TIMEOUT (100*HZ)
#define ZFCP_SCSI_ER_TIMEOUT (10*HZ)
/********************* CIO/QDIO SPECIFIC DEFINES *****************************/

View File

@ -64,8 +64,8 @@ static int zfcp_erp_strategy_check_action(struct zfcp_erp_action *, int);
static int zfcp_erp_adapter_strategy(struct zfcp_erp_action *);
static int zfcp_erp_adapter_strategy_generic(struct zfcp_erp_action *, int);
static int zfcp_erp_adapter_strategy_close(struct zfcp_erp_action *);
static int zfcp_erp_adapter_strategy_close_qdio(struct zfcp_erp_action *);
static int zfcp_erp_adapter_strategy_close_fsf(struct zfcp_erp_action *);
static void zfcp_erp_adapter_strategy_close_qdio(struct zfcp_erp_action *);
static void zfcp_erp_adapter_strategy_close_fsf(struct zfcp_erp_action *);
static int zfcp_erp_adapter_strategy_open(struct zfcp_erp_action *);
static int zfcp_erp_adapter_strategy_open_qdio(struct zfcp_erp_action *);
static int zfcp_erp_adapter_strategy_open_fsf(struct zfcp_erp_action *);
@ -93,10 +93,9 @@ static int zfcp_erp_unit_strategy_clearstati(struct zfcp_unit *);
static int zfcp_erp_unit_strategy_close(struct zfcp_erp_action *);
static int zfcp_erp_unit_strategy_open(struct zfcp_erp_action *);
static int zfcp_erp_action_dismiss_adapter(struct zfcp_adapter *);
static int zfcp_erp_action_dismiss_port(struct zfcp_port *);
static int zfcp_erp_action_dismiss_unit(struct zfcp_unit *);
static int zfcp_erp_action_dismiss(struct zfcp_erp_action *);
static void zfcp_erp_action_dismiss_port(struct zfcp_port *);
static void zfcp_erp_action_dismiss_unit(struct zfcp_unit *);
static void zfcp_erp_action_dismiss(struct zfcp_erp_action *);
static int zfcp_erp_action_enqueue(int, struct zfcp_adapter *,
struct zfcp_port *, struct zfcp_unit *);
@ -135,29 +134,39 @@ zfcp_fsf_request_timeout_handler(unsigned long data)
zfcp_erp_adapter_reopen(adapter, 0);
}
/*
* function: zfcp_fsf_scsi_er_timeout_handler
/**
* zfcp_fsf_scsi_er_timeout_handler - timeout handler for scsi eh tasks
*
* purpose: This function needs to be called whenever a SCSI error recovery
* action (abort/reset) does not return.
* Re-opening the adapter means that the command can be returned
* by zfcp (it is guarranteed that it does not return via the
* adapter anymore). The buffer can then be used again.
*
* returns: sod all
* This function needs to be called whenever a SCSI error recovery
* action (abort/reset) does not return. Re-opening the adapter means
* that the abort/reset command can be returned by zfcp. It won't complete
* via the adapter anymore (because qdio queues are closed). If ERP is
* already running on this adapter it will be stopped.
*/
void
zfcp_fsf_scsi_er_timeout_handler(unsigned long data)
void zfcp_fsf_scsi_er_timeout_handler(unsigned long data)
{
struct zfcp_adapter *adapter = (struct zfcp_adapter *) data;
unsigned long flags;
ZFCP_LOG_NORMAL("warning: SCSI error recovery timed out. "
"Restarting all operations on the adapter %s\n",
zfcp_get_busid_by_adapter(adapter));
debug_text_event(adapter->erp_dbf, 1, "eh_lmem_tout");
zfcp_erp_adapter_reopen(adapter, 0);
return;
write_lock_irqsave(&adapter->erp_lock, flags);
if (atomic_test_mask(ZFCP_STATUS_ADAPTER_ERP_PENDING,
&adapter->status)) {
zfcp_erp_modify_adapter_status(adapter,
ZFCP_STATUS_COMMON_UNBLOCKED|ZFCP_STATUS_COMMON_OPEN,
ZFCP_CLEAR);
zfcp_erp_action_dismiss_adapter(adapter);
write_unlock_irqrestore(&adapter->erp_lock, flags);
/* dismiss all pending requests including requests for ERP */
zfcp_fsf_req_dismiss_all(adapter);
adapter->fsf_req_seq_no = 0;
} else
write_unlock_irqrestore(&adapter->erp_lock, flags);
zfcp_erp_adapter_reopen(adapter, 0);
}
/*
@ -670,17 +679,10 @@ zfcp_erp_unit_reopen(struct zfcp_unit *unit, int clear_mask)
return retval;
}
/*
* function:
*
* purpose: disable I/O,
* return any open requests and clean them up,
* aim: no pending and incoming I/O
*
* returns:
/**
* zfcp_erp_adapter_block - mark adapter as blocked, block scsi requests
*/
static void
zfcp_erp_adapter_block(struct zfcp_adapter *adapter, int clear_mask)
static void zfcp_erp_adapter_block(struct zfcp_adapter *adapter, int clear_mask)
{
debug_text_event(adapter->erp_dbf, 6, "a_bl");
zfcp_erp_modify_adapter_status(adapter,
@ -688,15 +690,10 @@ zfcp_erp_adapter_block(struct zfcp_adapter *adapter, int clear_mask)
clear_mask, ZFCP_CLEAR);
}
/*
* function:
*
* purpose: enable I/O
*
* returns:
/**
* zfcp_erp_adapter_unblock - mark adapter as unblocked, allow scsi requests
*/
static void
zfcp_erp_adapter_unblock(struct zfcp_adapter *adapter)
static void zfcp_erp_adapter_unblock(struct zfcp_adapter *adapter)
{
debug_text_event(adapter->erp_dbf, 6, "a_ubl");
atomic_set_mask(ZFCP_STATUS_COMMON_UNBLOCKED, &adapter->status);
@ -897,23 +894,15 @@ zfcp_erp_strategy_check_fsfreq(struct zfcp_erp_action *erp_action)
return retval;
}
/*
* purpose: generic handler for asynchronous events related to erp_action events
* (normal completion, time-out, dismissing, retry after
* low memory condition)
/**
* zfcp_erp_async_handler_nolock - complete erp_action
*
* note: deletion of timer is not required (e.g. in case of a time-out),
* but a second try does no harm,
* we leave it in here to allow for greater simplification
*
* returns: 0 - there was an action to handle
* !0 - otherwise
* Used for normal completion, time-out, dismissal and failure after
* low memory condition.
*/
static int
zfcp_erp_async_handler_nolock(struct zfcp_erp_action *erp_action,
unsigned long set_mask)
static void zfcp_erp_async_handler_nolock(struct zfcp_erp_action *erp_action,
unsigned long set_mask)
{
int retval;
struct zfcp_adapter *adapter = erp_action->adapter;
if (zfcp_erp_action_exists(erp_action) == ZFCP_ERP_ACTION_RUNNING) {
@ -924,43 +913,26 @@ zfcp_erp_async_handler_nolock(struct zfcp_erp_action *erp_action,
del_timer(&erp_action->timer);
erp_action->status |= set_mask;
zfcp_erp_action_ready(erp_action);
retval = 0;
} else {
/* action is ready or gone - nothing to do */
debug_text_event(adapter->erp_dbf, 3, "a_asyh_gone");
debug_event(adapter->erp_dbf, 3, &erp_action->action,
sizeof (int));
retval = 1;
}
return retval;
}
/*
* purpose: generic handler for asynchronous events related to erp_action
* events (normal completion, time-out, dismissing, retry after
* low memory condition)
*
* note: deletion of timer is not required (e.g. in case of a time-out),
* but a second try does no harm,
* we leave it in here to allow for greater simplification
*
* returns: 0 - there was an action to handle
* !0 - otherwise
/**
* zfcp_erp_async_handler - wrapper for erp_async_handler_nolock w/ locking
*/
int
zfcp_erp_async_handler(struct zfcp_erp_action *erp_action,
unsigned long set_mask)
void zfcp_erp_async_handler(struct zfcp_erp_action *erp_action,
unsigned long set_mask)
{
struct zfcp_adapter *adapter = erp_action->adapter;
unsigned long flags;
int retval;
write_lock_irqsave(&adapter->erp_lock, flags);
retval = zfcp_erp_async_handler_nolock(erp_action, set_mask);
zfcp_erp_async_handler_nolock(erp_action, set_mask);
write_unlock_irqrestore(&adapter->erp_lock, flags);
return retval;
}
/*
@ -997,17 +969,15 @@ zfcp_erp_timeout_handler(unsigned long data)
zfcp_erp_async_handler(erp_action, ZFCP_STATUS_ERP_TIMEDOUT);
}
/*
* purpose: is called for an erp_action which needs to be ended
* though not being done,
* this is usually required if an higher is generated,
* action gets an appropriate flag and will be processed
* accordingly
/**
* zfcp_erp_action_dismiss - dismiss an erp_action
*
* locks: erp_lock held (thus we need to call another handler variant)
* adapter->erp_lock must be held
*
* Dismissal of an erp_action is usually required if an erp_action of
* higher priority is generated.
*/
static int
zfcp_erp_action_dismiss(struct zfcp_erp_action *erp_action)
static void zfcp_erp_action_dismiss(struct zfcp_erp_action *erp_action)
{
struct zfcp_adapter *adapter = erp_action->adapter;
@ -1015,8 +985,6 @@ zfcp_erp_action_dismiss(struct zfcp_erp_action *erp_action)
debug_event(adapter->erp_dbf, 2, &erp_action->action, sizeof (int));
zfcp_erp_async_handler_nolock(erp_action, ZFCP_STATUS_ERP_DISMISSED);
return 0;
}
int
@ -2072,18 +2040,12 @@ zfcp_erp_adapter_strategy_open_qdio(struct zfcp_erp_action *erp_action)
return retval;
}
/*
* function: zfcp_qdio_cleanup
*
* purpose: cleans up QDIO operation for the specified adapter
*
* returns: 0 - successful cleanup
* !0 - failed cleanup
/**
* zfcp_erp_adapter_strategy_close_qdio - close qdio queues for an adapter
*/
int
static void
zfcp_erp_adapter_strategy_close_qdio(struct zfcp_erp_action *erp_action)
{
int retval = ZFCP_ERP_SUCCEEDED;
int first_used;
int used_count;
struct zfcp_adapter *adapter = erp_action->adapter;
@ -2092,15 +2054,13 @@ zfcp_erp_adapter_strategy_close_qdio(struct zfcp_erp_action *erp_action)
ZFCP_LOG_DEBUG("error: attempt to shut down inactive QDIO "
"queues on adapter %s\n",
zfcp_get_busid_by_adapter(adapter));
retval = ZFCP_ERP_FAILED;
goto out;
return;
}
/*
* Get queue_lock and clear QDIOUP flag. Thus it's guaranteed that
* do_QDIO won't be called while qdio_shutdown is in progress.
*/
write_lock_irq(&adapter->request_queue.queue_lock);
atomic_clear_mask(ZFCP_STATUS_ADAPTER_QDIOUP, &adapter->status);
write_unlock_irq(&adapter->request_queue.queue_lock);
@ -2132,8 +2092,6 @@ zfcp_erp_adapter_strategy_close_qdio(struct zfcp_erp_action *erp_action)
adapter->request_queue.free_index = 0;
atomic_set(&adapter->request_queue.free_count, 0);
adapter->request_queue.distance_from_int = 0;
out:
return retval;
}
static int
@ -2256,11 +2214,11 @@ zfcp_erp_adapter_strategy_open_fsf_xport(struct zfcp_erp_action *erp_action)
"%s)\n", zfcp_get_busid_by_adapter(adapter));
ret = ZFCP_ERP_FAILED;
}
if (!atomic_test_mask(ZFCP_STATUS_ADAPTER_XPORT_OK, &adapter->status)) {
ZFCP_LOG_INFO("error: exchange port data failed (adapter "
/* don't treat as error for the sake of compatibility */
if (!atomic_test_mask(ZFCP_STATUS_ADAPTER_XPORT_OK, &adapter->status))
ZFCP_LOG_INFO("warning: exchange port data failed (adapter "
"%s\n", zfcp_get_busid_by_adapter(adapter));
ret = ZFCP_ERP_FAILED;
}
return ret;
}
@ -2290,18 +2248,12 @@ zfcp_erp_adapter_strategy_open_fsf_statusread(struct zfcp_erp_action
return retval;
}
/*
* function: zfcp_fsf_cleanup
*
* purpose: cleanup FSF operation for specified adapter
*
* returns: 0 - FSF operation successfully cleaned up
* !0 - failed to cleanup FSF operation for this adapter
/**
* zfcp_erp_adapter_strategy_close_fsf - stop FSF operations for an adapter
*/
static int
static void
zfcp_erp_adapter_strategy_close_fsf(struct zfcp_erp_action *erp_action)
{
int retval = ZFCP_ERP_SUCCEEDED;
struct zfcp_adapter *adapter = erp_action->adapter;
/*
@ -2315,8 +2267,6 @@ zfcp_erp_adapter_strategy_close_fsf(struct zfcp_erp_action *erp_action)
/* all ports and units are closed */
zfcp_erp_modify_adapter_status(adapter,
ZFCP_STATUS_COMMON_OPEN, ZFCP_CLEAR);
return retval;
}
/*
@ -3291,10 +3241,8 @@ zfcp_erp_action_cleanup(int action, struct zfcp_adapter *adapter,
}
static int
zfcp_erp_action_dismiss_adapter(struct zfcp_adapter *adapter)
void zfcp_erp_action_dismiss_adapter(struct zfcp_adapter *adapter)
{
int retval = 0;
struct zfcp_port *port;
debug_text_event(adapter->erp_dbf, 5, "a_actab");
@ -3303,14 +3251,10 @@ zfcp_erp_action_dismiss_adapter(struct zfcp_adapter *adapter)
else
list_for_each_entry(port, &adapter->port_list_head, list)
zfcp_erp_action_dismiss_port(port);
return retval;
}
static int
zfcp_erp_action_dismiss_port(struct zfcp_port *port)
static void zfcp_erp_action_dismiss_port(struct zfcp_port *port)
{
int retval = 0;
struct zfcp_unit *unit;
struct zfcp_adapter *adapter = port->adapter;
@ -3321,22 +3265,16 @@ zfcp_erp_action_dismiss_port(struct zfcp_port *port)
else
list_for_each_entry(unit, &port->unit_list_head, list)
zfcp_erp_action_dismiss_unit(unit);
return retval;
}
static int
zfcp_erp_action_dismiss_unit(struct zfcp_unit *unit)
static void zfcp_erp_action_dismiss_unit(struct zfcp_unit *unit)
{
int retval = 0;
struct zfcp_adapter *adapter = unit->port->adapter;
debug_text_event(adapter->erp_dbf, 5, "u_actab");
debug_event(adapter->erp_dbf, 5, &unit->fcp_lun, sizeof (fcp_lun_t));
if (atomic_test_mask(ZFCP_STATUS_COMMON_ERP_INUSE, &unit->status))
zfcp_erp_action_dismiss(&unit->erp_action);
return retval;
}
static inline void

View File

@ -139,6 +139,7 @@ extern void zfcp_erp_modify_adapter_status(struct zfcp_adapter *, u32, int);
extern int zfcp_erp_adapter_reopen(struct zfcp_adapter *, int);
extern int zfcp_erp_adapter_shutdown(struct zfcp_adapter *, int);
extern void zfcp_erp_adapter_failed(struct zfcp_adapter *);
extern void zfcp_erp_action_dismiss_adapter(struct zfcp_adapter *);
extern void zfcp_erp_modify_port_status(struct zfcp_port *, u32, int);
extern int zfcp_erp_port_reopen(struct zfcp_port *, int);
@ -155,7 +156,7 @@ extern void zfcp_erp_unit_failed(struct zfcp_unit *);
extern int zfcp_erp_thread_setup(struct zfcp_adapter *);
extern int zfcp_erp_thread_kill(struct zfcp_adapter *);
extern int zfcp_erp_wait(struct zfcp_adapter *);
extern int zfcp_erp_async_handler(struct zfcp_erp_action *, unsigned long);
extern void zfcp_erp_async_handler(struct zfcp_erp_action *, unsigned long);
extern int zfcp_test_link(struct zfcp_port *);

View File

@ -30,7 +30,6 @@ static int zfcp_scsi_queuecommand(struct scsi_cmnd *,
void (*done) (struct scsi_cmnd *));
static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *);
static int zfcp_scsi_eh_device_reset_handler(struct scsi_cmnd *);
static int zfcp_scsi_eh_bus_reset_handler(struct scsi_cmnd *);
static int zfcp_scsi_eh_host_reset_handler(struct scsi_cmnd *);
static int zfcp_task_management_function(struct zfcp_unit *, u8,
struct scsi_cmnd *);
@ -46,30 +45,22 @@ struct zfcp_data zfcp_data = {
.scsi_host_template = {
.name = ZFCP_NAME,
.proc_name = "zfcp",
.proc_info = NULL,
.detect = NULL,
.slave_alloc = zfcp_scsi_slave_alloc,
.slave_configure = zfcp_scsi_slave_configure,
.slave_destroy = zfcp_scsi_slave_destroy,
.queuecommand = zfcp_scsi_queuecommand,
.eh_abort_handler = zfcp_scsi_eh_abort_handler,
.eh_device_reset_handler = zfcp_scsi_eh_device_reset_handler,
.eh_bus_reset_handler = zfcp_scsi_eh_bus_reset_handler,
.eh_bus_reset_handler = zfcp_scsi_eh_host_reset_handler,
.eh_host_reset_handler = zfcp_scsi_eh_host_reset_handler,
.can_queue = 4096,
.this_id = -1,
/*
* FIXME:
* one less? can zfcp_create_sbale cope with it?
*/
.sg_tablesize = ZFCP_MAX_SBALES_PER_REQ,
.cmd_per_lun = 1,
.unchecked_isa_dma = 0,
.use_clustering = 1,
.sdev_attrs = zfcp_sysfs_sdev_attrs,
},
.driver_version = ZFCP_VERSION,
/* rest initialised with zeros */
};
/* Find start of Response Information in FCP response unit*/
@ -176,8 +167,14 @@ zfcp_scsi_slave_alloc(struct scsi_device *sdp)
return retval;
}
static void
zfcp_scsi_slave_destroy(struct scsi_device *sdpnt)
/**
* zfcp_scsi_slave_destroy - called when scsi device is removed
*
* Remove reference to associated scsi device for an zfcp_unit.
* Mark zfcp_unit as failed. The scsi device might be deleted via sysfs
* or a scan for this device might have failed.
*/
static void zfcp_scsi_slave_destroy(struct scsi_device *sdpnt)
{
struct zfcp_unit *unit = (struct zfcp_unit *) sdpnt->hostdata;
@ -185,6 +182,7 @@ zfcp_scsi_slave_destroy(struct scsi_device *sdpnt)
atomic_clear_mask(ZFCP_STATUS_UNIT_REGISTERED, &unit->status);
sdpnt->hostdata = NULL;
unit->device = NULL;
zfcp_erp_unit_failed(unit);
zfcp_unit_put(unit);
} else {
ZFCP_LOG_NORMAL("bug: no unit associated with SCSI device at "
@ -549,35 +547,38 @@ zfcp_task_management_function(struct zfcp_unit *unit, u8 tm_flags,
}
/**
* zfcp_scsi_eh_bus_reset_handler - reset bus (reopen adapter)
* zfcp_scsi_eh_host_reset_handler - handler for host and bus reset
*
* If ERP is already running it will be stopped.
*/
int
zfcp_scsi_eh_bus_reset_handler(struct scsi_cmnd *scpnt)
int zfcp_scsi_eh_host_reset_handler(struct scsi_cmnd *scpnt)
{
struct zfcp_unit *unit = (struct zfcp_unit*) scpnt->device->hostdata;
struct zfcp_adapter *adapter = unit->port->adapter;
struct zfcp_unit *unit;
struct zfcp_adapter *adapter;
unsigned long flags;
ZFCP_LOG_NORMAL("bus reset because of problems with "
unit = (struct zfcp_unit*) scpnt->device->hostdata;
adapter = unit->port->adapter;
ZFCP_LOG_NORMAL("host/bus reset because of problems with "
"unit 0x%016Lx\n", unit->fcp_lun);
zfcp_erp_adapter_reopen(adapter, 0);
zfcp_erp_wait(adapter);
return SUCCESS;
}
/**
* zfcp_scsi_eh_host_reset_handler - reset host (reopen adapter)
*/
int
zfcp_scsi_eh_host_reset_handler(struct scsi_cmnd *scpnt)
{
struct zfcp_unit *unit = (struct zfcp_unit*) scpnt->device->hostdata;
struct zfcp_adapter *adapter = unit->port->adapter;
ZFCP_LOG_NORMAL("host reset because of problems with "
"unit 0x%016Lx\n", unit->fcp_lun);
zfcp_erp_adapter_reopen(adapter, 0);
zfcp_erp_wait(adapter);
write_lock_irqsave(&adapter->erp_lock, flags);
if (atomic_test_mask(ZFCP_STATUS_ADAPTER_ERP_PENDING,
&adapter->status)) {
zfcp_erp_modify_adapter_status(adapter,
ZFCP_STATUS_COMMON_UNBLOCKED|ZFCP_STATUS_COMMON_OPEN,
ZFCP_CLEAR);
zfcp_erp_action_dismiss_adapter(adapter);
write_unlock_irqrestore(&adapter->erp_lock, flags);
zfcp_fsf_req_dismiss_all(adapter);
adapter->fsf_req_seq_no = 0;
zfcp_erp_adapter_reopen(adapter, 0);
} else {
write_unlock_irqrestore(&adapter->erp_lock, flags);
zfcp_erp_adapter_reopen(adapter, 0);
zfcp_erp_wait(adapter);
}
return SUCCESS;
}