2008-03-31 17:15:28 +08:00
|
|
|
/*
 * zfcp device driver
 * debug feature declarations
 *
 * Copyright IBM Corp. 2008, 2017
 */
|
|
|
|
|
|
|
|
#ifndef ZFCP_DBF_H
|
|
|
|
#define ZFCP_DBF_H
|
|
|
|
|
2009-11-24 23:54:08 +08:00
|
|
|
#include <scsi/fc/fc_fcp.h>
|
2009-08-18 21:43:08 +08:00
|
|
|
#include "zfcp_ext.h"
|
2008-03-31 17:15:28 +08:00
|
|
|
#include "zfcp_fsf.h"
|
2009-08-18 21:43:08 +08:00
|
|
|
#include "zfcp_def.h"
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:12 +08:00
|
|
|
/*
 * Size in bytes of the tag field of every trace record. Tags such as
 * "fs_rerr" use all 7 characters, so tag fields carry no terminating NUL.
 */
#define ZFCP_DBF_TAG_LEN       7

/* LUN value with all bits set, marking a record without a valid LUN */
#define ZFCP_DBF_INVALID_LUN	0xFFFFFFFFFFFFFFFFull
|
|
|
|
|
2016-08-11 00:30:46 +08:00
|
|
|
/**
 * enum zfcp_dbf_pseudo_erp_act_type - pseudo ERP action types for tracing
 * @ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD: remote port add
 * @ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL: remote port delete
 *
 * NOTE(review): the users of these values are not visible in this header;
 * presumably the high values are chosen to stay clear of real ERP action
 * codes -- confirm against the ERP code.
 */
enum zfcp_dbf_pseudo_erp_act_type {
	ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD = 0xff,
	ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL = 0xfe,
};
|
|
|
|
|
2010-12-02 22:16:12 +08:00
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_rec_trigger - trace record for triggered recovery action
|
|
|
|
* @ready: number of ready recovery actions
|
|
|
|
* @running: number of running recovery actions
|
|
|
|
* @want: wanted recovery action
|
|
|
|
* @need: needed recovery action
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_rec_trigger {
|
2008-03-31 17:15:28 +08:00
|
|
|
u32 ready;
|
|
|
|
u32 running;
|
|
|
|
u8 want;
|
|
|
|
u8 need;
|
2010-12-02 22:16:12 +08:00
|
|
|
} __packed;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_rec_running - trace record for running recovery
|
|
|
|
* @fsf_req_id: request id for fsf requests
|
|
|
|
* @rec_status: status of the fsf request
|
|
|
|
* @rec_step: current step of the recovery action
|
|
|
|
* rec_count: recovery counter
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_rec_running {
|
|
|
|
u64 fsf_req_id;
|
|
|
|
u32 rec_status;
|
|
|
|
u16 rec_step;
|
|
|
|
u8 rec_action;
|
|
|
|
u8 rec_count;
|
|
|
|
} __packed;
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:12 +08:00
|
|
|
/**
 * enum zfcp_dbf_rec_id - recovery trace record id
 * @ZFCP_DBF_REC_TRIG: triggered recovery identifier
 * @ZFCP_DBF_REC_RUN: running recovery identifier
 */
enum zfcp_dbf_rec_id {
	ZFCP_DBF_REC_TRIG = 1,
	ZFCP_DBF_REC_RUN = 2,
};
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:12 +08:00
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_rec - trace record for error recovery actions
|
|
|
|
* @id: unique number of recovery record type
|
|
|
|
* @tag: identifier string specifying the location of initiation
|
|
|
|
* @lun: logical unit number
|
|
|
|
* @wwpn: word wide port number
|
|
|
|
* @d_id: destination ID
|
|
|
|
* @adapter_status: current status of the adapter
|
|
|
|
* @port_status: current status of the port
|
|
|
|
* @lun_status: current status of the lun
|
|
|
|
* @u.trig: structure zfcp_dbf_rec_trigger
|
|
|
|
* @u.run: structure zfcp_dbf_rec_running
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_rec {
|
2008-03-31 17:15:28 +08:00
|
|
|
u8 id;
|
2010-12-02 22:16:12 +08:00
|
|
|
char tag[ZFCP_DBF_TAG_LEN];
|
|
|
|
u64 lun;
|
|
|
|
u64 wwpn;
|
|
|
|
u32 d_id;
|
|
|
|
u32 adapter_status;
|
|
|
|
u32 port_status;
|
|
|
|
u32 lun_status;
|
2008-03-31 17:15:28 +08:00
|
|
|
union {
|
2010-12-02 22:16:12 +08:00
|
|
|
struct zfcp_dbf_rec_trigger trig;
|
|
|
|
struct zfcp_dbf_rec_running run;
|
2008-03-31 17:15:28 +08:00
|
|
|
} u;
|
2010-12-02 22:16:12 +08:00
|
|
|
} __packed;
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:13 +08:00
|
|
|
/**
 * enum zfcp_dbf_san_id - SAN trace record identifier
 * @ZFCP_DBF_SAN_REQ: request trace record id
 * @ZFCP_DBF_SAN_RES: response trace record id
 * @ZFCP_DBF_SAN_ELS: extended link service record id
 */
enum zfcp_dbf_san_id {
	ZFCP_DBF_SAN_REQ = 1,
	ZFCP_DBF_SAN_RES = 2,
	ZFCP_DBF_SAN_ELS = 3,
};
|
|
|
|
|
|
|
|
/** struct zfcp_dbf_san - trace record for SAN requests and responses
|
|
|
|
* @id: unique number of recovery record type
|
|
|
|
* @tag: identifier string specifying the location of initiation
|
|
|
|
* @fsf_req_id: request id for fsf requests
|
|
|
|
* @payload: unformatted information related to request/response
|
|
|
|
* @d_id: destination id
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_san {
|
|
|
|
u8 id;
|
|
|
|
char tag[ZFCP_DBF_TAG_LEN];
|
|
|
|
u64 fsf_req_id;
|
|
|
|
u32 d_id;
|
|
|
|
#define ZFCP_DBF_SAN_MAX_PAYLOAD (FC_CT_HDR_LEN + 32)
|
|
|
|
char payload[ZFCP_DBF_SAN_MAX_PAYLOAD];
|
2016-08-11 00:30:53 +08:00
|
|
|
u16 pl_len;
|
2010-12-02 22:16:13 +08:00
|
|
|
} __packed;
|
|
|
|
|
2010-12-02 22:16:14 +08:00
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_hba_res - trace record for hba responses
|
|
|
|
* @req_issued: timestamp when request was issued
|
|
|
|
* @prot_status: protocol status
|
|
|
|
* @prot_status_qual: protocol status qualifier
|
|
|
|
* @fsf_status: fsf status
|
|
|
|
* @fsf_status_qual: fsf status qualifier
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_hba_res {
|
|
|
|
u64 req_issued;
|
|
|
|
u32 prot_status;
|
|
|
|
u8 prot_status_qual[FSF_PROT_STATUS_QUAL_SIZE];
|
2008-03-31 17:15:28 +08:00
|
|
|
u32 fsf_status;
|
2010-12-02 22:16:14 +08:00
|
|
|
u8 fsf_status_qual[FSF_STATUS_QUALIFIER_SIZE];
|
2016-08-11 00:30:50 +08:00
|
|
|
u32 port_handle;
|
|
|
|
u32 lun_handle;
|
2010-12-02 22:16:14 +08:00
|
|
|
} __packed;
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:14 +08:00
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_hba_uss - trace record for unsolicited status
|
|
|
|
* @status_type: type of unsolicited status
|
|
|
|
* @status_subtype: subtype of unsolicited status
|
|
|
|
* @d_id: destination ID
|
|
|
|
* @lun: logical unit number
|
|
|
|
* @queue_designator: queue designator
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_hba_uss {
|
2008-03-31 17:15:28 +08:00
|
|
|
u32 status_type;
|
|
|
|
u32 status_subtype;
|
2010-12-02 22:16:14 +08:00
|
|
|
u32 d_id;
|
|
|
|
u64 lun;
|
|
|
|
u64 queue_designator;
|
|
|
|
} __packed;
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:14 +08:00
|
|
|
/**
 * enum zfcp_dbf_hba_id - HBA trace record identifier
 * @ZFCP_DBF_HBA_RES: response trace record
 * @ZFCP_DBF_HBA_USS: unsolicited status trace record
 * @ZFCP_DBF_HBA_BIT: bit error trace record
 * @ZFCP_DBF_HBA_BASIC: basic trace record; introduced together with the
 *	tracing of cio callbacks that are ignored while the adapter is
 *	suspended. NOTE(review): struct zfcp_dbf_hba has no union member
 *	dedicated to this id -- confirm which fields such records fill.
 */
enum zfcp_dbf_hba_id {
	ZFCP_DBF_HBA_RES	= 1,
	ZFCP_DBF_HBA_USS	= 2,
	ZFCP_DBF_HBA_BIT	= 3,
	ZFCP_DBF_HBA_BASIC	= 4,
};
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:14 +08:00
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_hba - common trace record for HBA records
|
|
|
|
* @id: unique number of recovery record type
|
|
|
|
* @tag: identifier string specifying the location of initiation
|
|
|
|
* @fsf_req_id: request id for fsf requests
|
|
|
|
* @fsf_req_status: status of fsf request
|
|
|
|
* @fsf_cmd: fsf command
|
|
|
|
* @fsf_seq_no: fsf sequence number
|
|
|
|
* @pl_len: length of payload stored as zfcp_dbf_pay
|
|
|
|
* @u: record type specific data
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_hba {
|
|
|
|
u8 id;
|
|
|
|
char tag[ZFCP_DBF_TAG_LEN];
|
|
|
|
u64 fsf_req_id;
|
|
|
|
u32 fsf_req_status;
|
|
|
|
u32 fsf_cmd;
|
|
|
|
u32 fsf_seq_no;
|
|
|
|
u16 pl_len;
|
2008-03-31 17:15:28 +08:00
|
|
|
union {
|
2010-12-02 22:16:14 +08:00
|
|
|
struct zfcp_dbf_hba_res res;
|
|
|
|
struct zfcp_dbf_hba_uss uss;
|
|
|
|
struct fsf_bit_error_payload be;
|
2008-03-31 17:15:29 +08:00
|
|
|
} u;
|
2010-12-02 22:16:14 +08:00
|
|
|
} __packed;
|
|
|
|
|
2010-12-02 22:16:15 +08:00
|
|
|
/**
 * enum zfcp_dbf_scsi_id - scsi trace record identifier
 * @ZFCP_DBF_SCSI_CMND: scsi command trace record
 */
enum zfcp_dbf_scsi_id {
	ZFCP_DBF_SCSI_CMND	= 1,
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_scsi - common trace record for SCSI records
|
|
|
|
* @id: unique number of recovery record type
|
|
|
|
* @tag: identifier string specifying the location of initiation
|
|
|
|
* @scsi_id: scsi device id
|
|
|
|
* @scsi_lun: scsi device logical unit number
|
|
|
|
* @scsi_result: scsi result
|
|
|
|
* @scsi_retries: current retry number of scsi request
|
|
|
|
* @scsi_allowed: allowed retries
|
|
|
|
* @fcp_rsp_info: FCP response info
|
|
|
|
* @scsi_opcode: scsi opcode
|
|
|
|
* @fsf_req_id: request id of fsf request
|
|
|
|
* @host_scribble: LLD specific data attached to SCSI request
|
|
|
|
* @pl_len: length of paload stored as zfcp_dbf_pay
|
|
|
|
* @fsf_rsp: response for fsf request
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_scsi {
|
|
|
|
u8 id;
|
|
|
|
char tag[ZFCP_DBF_TAG_LEN];
|
|
|
|
u32 scsi_id;
|
|
|
|
u32 scsi_lun;
|
|
|
|
u32 scsi_result;
|
|
|
|
u8 scsi_retries;
|
|
|
|
u8 scsi_allowed;
|
|
|
|
u8 fcp_rsp_info;
|
|
|
|
#define ZFCP_DBF_SCSI_OPCODE 16
|
|
|
|
u8 scsi_opcode[ZFCP_DBF_SCSI_OPCODE];
|
|
|
|
u64 fsf_req_id;
|
|
|
|
u64 host_scribble;
|
|
|
|
u16 pl_len;
|
|
|
|
struct fcp_resp_with_ext fcp_rsp;
|
|
|
|
} __packed;
|
|
|
|
|
2010-12-02 22:16:14 +08:00
|
|
|
/**
|
|
|
|
* struct zfcp_dbf_pay - trace record for unformatted payload information
|
|
|
|
* @area: area this record is originated from
|
|
|
|
* @counter: ascending record number
|
|
|
|
* @fsf_req_id: request id of fsf request
|
|
|
|
* @data: unformatted data
|
|
|
|
*/
|
|
|
|
struct zfcp_dbf_pay {
|
2010-12-02 22:16:16 +08:00
|
|
|
u8 counter;
|
2010-12-02 22:16:14 +08:00
|
|
|
char area[ZFCP_DBF_TAG_LEN];
|
|
|
|
u64 fsf_req_id;
|
|
|
|
#define ZFCP_DBF_PAY_MAX_REC 0x100
|
|
|
|
char data[ZFCP_DBF_PAY_MAX_REC];
|
|
|
|
} __packed;
|
2008-03-31 17:15:28 +08:00
|
|
|
|
2010-12-02 22:16:16 +08:00
|
|
|
/**
|
|
|
|
* struct zfcp_dbf - main dbf trace structure
|
|
|
|
* @pay: reference to payload trace area
|
|
|
|
* @rec: reference to recovery trace area
|
|
|
|
* @hba: reference to hba trace area
|
|
|
|
* @san: reference to san trace area
|
|
|
|
* @scsi: reference to scsi trace area
|
|
|
|
* @pay_lock: lock protecting payload trace buffer
|
|
|
|
* @rec_lock: lock protecting recovery trace buffer
|
|
|
|
* @hba_lock: lock protecting hba trace buffer
|
|
|
|
* @san_lock: lock protecting san trace buffer
|
|
|
|
* @scsi_lock: lock protecting scsi trace buffer
|
|
|
|
* @pay_buf: pre-allocated buffer for payload
|
|
|
|
* @rec_buf: pre-allocated buffer for recovery
|
|
|
|
* @hba_buf: pre-allocated buffer for hba
|
|
|
|
* @san_buf: pre-allocated buffer for san
|
|
|
|
* @scsi_buf: pre-allocated buffer for scsi
|
|
|
|
*/
|
2009-08-18 21:43:07 +08:00
|
|
|
struct zfcp_dbf {
|
2010-12-02 22:16:14 +08:00
|
|
|
debug_info_t *pay;
|
2009-08-18 21:43:21 +08:00
|
|
|
debug_info_t *rec;
|
|
|
|
debug_info_t *hba;
|
|
|
|
debug_info_t *san;
|
|
|
|
debug_info_t *scsi;
|
2010-12-02 22:16:14 +08:00
|
|
|
spinlock_t pay_lock;
|
2009-08-18 21:43:21 +08:00
|
|
|
spinlock_t rec_lock;
|
|
|
|
spinlock_t hba_lock;
|
|
|
|
spinlock_t san_lock;
|
|
|
|
spinlock_t scsi_lock;
|
2010-12-02 22:16:16 +08:00
|
|
|
struct zfcp_dbf_pay pay_buf;
|
2010-12-02 22:16:12 +08:00
|
|
|
struct zfcp_dbf_rec rec_buf;
|
2010-12-02 22:16:14 +08:00
|
|
|
struct zfcp_dbf_hba hba_buf;
|
2010-12-02 22:16:13 +08:00
|
|
|
struct zfcp_dbf_san san_buf;
|
2010-12-02 22:16:15 +08:00
|
|
|
struct zfcp_dbf_scsi scsi_buf;
|
2009-08-18 21:43:07 +08:00
|
|
|
};
|
|
|
|
|
2016-12-10 00:16:32 +08:00
|
|
|
/**
|
|
|
|
* zfcp_dbf_hba_fsf_resp_suppress - true if we should not trace by default
|
|
|
|
* @req: request that has been completed
|
|
|
|
*
|
|
|
|
* Returns true if FCP response with only benign residual under count.
|
|
|
|
*/
|
|
|
|
static inline
|
|
|
|
bool zfcp_dbf_hba_fsf_resp_suppress(struct zfcp_fsf_req *req)
|
|
|
|
{
|
|
|
|
struct fsf_qtcb *qtcb = req->qtcb;
|
|
|
|
u32 fsf_stat = qtcb->header.fsf_status;
|
|
|
|
struct fcp_resp *fcp_rsp;
|
|
|
|
u8 rsp_flags, fr_status;
|
|
|
|
|
|
|
|
if (qtcb->prefix.qtcb_type != FSF_IO_COMMAND)
|
|
|
|
return false; /* not an FCP response */
|
|
|
|
fcp_rsp = (struct fcp_resp *)&qtcb->bottom.io.fcp_rsp;
|
|
|
|
rsp_flags = fcp_rsp->fr_flags;
|
|
|
|
fr_status = fcp_rsp->fr_status;
|
|
|
|
return (fsf_stat == FSF_FCP_RSP_AVAILABLE) &&
|
|
|
|
(rsp_flags == FCP_RESID_UNDER) &&
|
|
|
|
(fr_status == SAM_STAT_GOOD);
|
|
|
|
}
|
|
|
|
|
2009-08-18 21:43:09 +08:00
|
|
|
static inline
|
2010-12-02 22:16:14 +08:00
|
|
|
void zfcp_dbf_hba_fsf_resp(char *tag, int level, struct zfcp_fsf_req *req)
|
2009-08-18 21:43:09 +08:00
|
|
|
{
|
2013-09-18 23:21:34 +08:00
|
|
|
if (debug_level_enabled(req->adapter->dbf->hba, level))
|
2016-08-11 00:30:47 +08:00
|
|
|
zfcp_dbf_hba_fsf_res(tag, level, req);
|
2009-08-18 21:43:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2009-08-18 21:43:21 +08:00
|
|
|
* zfcp_dbf_hba_fsf_response - trace event for request completion
|
2010-12-02 22:16:15 +08:00
|
|
|
* @req: request that has been completed
|
2009-08-18 21:43:09 +08:00
|
|
|
*/
|
2010-12-02 22:16:14 +08:00
|
|
|
static inline
|
|
|
|
void zfcp_dbf_hba_fsf_response(struct zfcp_fsf_req *req)
|
2009-08-18 21:43:09 +08:00
|
|
|
{
|
|
|
|
struct fsf_qtcb *qtcb = req->qtcb;
|
|
|
|
|
scsi: zfcp: trace HBA FSF response by default on dismiss or timedout late response
At the default trace level, we only trace unsuccessful events including
FSF responses.
zfcp_dbf_hba_fsf_response() only used protocol status and FSF status to
decide on an unsuccessful response. However, this is only one of multiple
possible sources determining a failed struct zfcp_fsf_req.
An FSF request can also "fail" if its response runs into an ERP timeout
or if it gets dismissed because a higher level recovery was triggered
[trace tags "erscf_1" or "erscf_2" in zfcp_erp_strategy_check_fsfreq()].
FSF requests with ERP timeout are:
FSF_QTCB_EXCHANGE_CONFIG_DATA, FSF_QTCB_EXCHANGE_PORT_DATA,
FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT or
FSF_QTCB_CLOSE_PHYSICAL_PORT for target ports,
FSF_QTCB_OPEN_LUN, FSF_QTCB_CLOSE_LUN.
One example is slow queue processing which can cause follow-on errors,
e.g. FSF_PORT_ALREADY_OPEN after FSF_QTCB_OPEN_PORT_WITH_DID timed out.
In order to see the root cause, we need to see late responses even if the
channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD.
Example trace records formatted with zfcpdbf from the s390-tools package:
Timestamp : ...
Area : REC
Subarea : 00
Level : 1
Exception : -
CPU ID : ..
Caller : ...
Record ID : 1
Tag : fcegpf1
LUN : 0xffffffffffffffff
WWPN : 0x<WWPN>
D_ID : 0x00<D_ID>
Adapter status : 0x5400050b
Port status : 0x41200000
LUN status : 0x00000000
Ready count : 0x00000001
Running count : 0x...
ERP want : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT
ERP need : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT
|
Timestamp : ... 30 seconds later
Area : REC
Subarea : 00
Level : 1
Exception : -
CPU ID : ..
Caller : ...
Record ID : 2
Tag : erscf_2
LUN : 0xffffffffffffffff
WWPN : 0x<WWPN>
D_ID : 0x00<D_ID>
Adapter status : 0x5400050b
Port status : 0x41200000
LUN status : 0x00000000
Request ID : 0x<request_ID>
ERP status : 0x10000000 ZFCP_STATUS_ERP_TIMEDOUT
ERP step : 0x0800 ZFCP_ERP_STEP_PORT_OPENING
ERP action : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT
ERP count : 0x00
|
Timestamp : ... later than previous record
Area : HBA
Subarea : 00
Level : 5 > default level => 3 <= default level
Exception : -
CPU ID : 00
Caller : ...
Record ID : 1
Tag : fs_qtcb => fs_rerr
Request ID : 0x<request_ID>
Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED
| ZFCP_STATUS_FSFREQ_CLEANUP
FSF cmnd : 0x00000005
FSF sequence no: 0x...
FSF issued : ... > 30 seconds ago
FSF stat : 0x00000000 FSF_GOOD
FSF stat qual : 00000000 00000000 00000000 00000000
Prot stat : 0x00000001 FSF_PROT_GOOD
Prot stat qual : 00000000 00000000 00000000 00000000
Port handle : 0x...
LUN handle : 0x00000000
QTCB log length: ...
QTCB log info : ...
In case of problems detecting that new responses are waiting on the input
queue, we sooner or later trigger adapter recovery due to an FSF request
timeout (trace tag "fsrth_1").
FSF requests with FSF request timeout are:
typically FSF_QTCB_ABORT_FCP_CMND; but theoretically also
FSF_QTCB_EXCHANGE_CONFIG_DATA or FSF_QTCB_EXCHANGE_PORT_DATA via sysfs,
FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT for WKA ports,
FSF_QTCB_FCP_CMND for task management function (LUN / target reset).
One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD
because the channel filled in the response via DMA into the request's QTCB.
In a theroretical case, inject code can create an erroneous FSF request
on purpose. If data router is enabled, it uses deferred error reporting.
A READ SCSI command can succeed with FSF_PROT_GOOD, FSF_GOOD, and
SAM_STAT_GOOD. But on writing the read data to host memory via DMA,
it can still fail, e.g. if an intentionally wrong scatter list does not
provide enough space. Rather than getting an unsuccessful response,
we get a QDIO activate check which in turn triggers adapter recovery.
One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD
because the channel filled in the response via DMA into the request's QTCB.
Example trace records formatted with zfcpdbf from the s390-tools package:
Timestamp : ...
Area : HBA
Subarea : 00
Level : 6 > default level => 3 <= default level
Exception : -
CPU ID : ..
Caller : ...
Record ID : 1
Tag : fs_norm => fs_rerr
Request ID : 0x<request_ID2>
Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED
| ZFCP_STATUS_FSFREQ_CLEANUP
FSF cmnd : 0x00000001
FSF sequence no: 0x...
FSF issued : ...
FSF stat : 0x00000000 FSF_GOOD
FSF stat qual : 00000000 00000000 00000000 00000000
Prot stat : 0x00000001 FSF_PROT_GOOD
Prot stat qual : ........ ........ 00000000 00000000
Port handle : 0x...
LUN handle : 0x...
|
Timestamp : ...
Area : SCSI
Subarea : 00
Level : 3
Exception : -
CPU ID : ..
Caller : ...
Record ID : 1
Tag : rsl_err
Request ID : 0x<request_ID2>
SCSI ID : 0x...
SCSI LUN : 0x...
SCSI result : 0x000e0000 DID_TRANSPORT_DISRUPTED
SCSI retries : 0x00
SCSI allowed : 0x05
SCSI scribble : 0x<request_ID2>
SCSI opcode : 28... Read(10)
FCP rsp inf cod: 0x00
FCP rsp IU : 00000000 00000000 00000000 00000000
^^ SAM_STAT_GOOD
00000000 00000000
Only with luck in both above cases, we could see a follow-on trace record
of an unsuccesful event following a successful but late FSF response with
FSF_PROT_GOOD and FSF_GOOD. Typically this was the case for I/O requests
resulting in a SCSI trace record "rsl_err" with DID_TRANSPORT_DISRUPTED
[On ZFCP_STATUS_FSFREQ_DISMISSED, zfcp_fsf_protstatus_eval() sets
ZFCP_STATUS_FSFREQ_ERROR seen by the request handler functions as failure].
However, the reason for this follow-on trace was invisible because the
corresponding HBA trace record was missing at the default trace level
(by default hidden records with tags "fs_norm", "fs_qtcb", or "fs_open").
On adapter recovery, after we had shut down the QDIO queues, we perform
unsuccessful pseudo completions with flag ZFCP_STATUS_FSFREQ_DISMISSED
for each pending FSF request in zfcp_fsf_req_dismiss_all().
In order to find the root cause, we need to see all pseudo responses even
if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD.
Therefore, check zfcp_fsf_req.status for ZFCP_STATUS_FSFREQ_DISMISSED
or ZFCP_STATUS_FSFREQ_ERROR and trace with a new tag "fs_rerr".
It does not matter that there are numerous places which set
ZFCP_STATUS_FSFREQ_ERROR after the location where we trace an FSF response
early. These cases are based on protocol status != FSF_PROT_GOOD or
== FSF_PROT_FSF_STATUS_PRESENTED and are thus already traced by default
as trace tag "fs_perr" or "fs_ferr" respectively.
NB: The trace record with tag "fssrh_1" for status read buffers on dismiss
all remains. zfcp_fsf_req_complete() handles this and returns early.
All other FSF request types are handled separately and as described above.
Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com>
Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features")
Fixes: 2e261af84cdb ("[SCSI] zfcp: Only collect FSF/HBA debug data for matching trace levels")
Cc: <stable@vger.kernel.org> #2.6.38+
Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2017-07-28 18:30:57 +08:00
|
|
|
if (unlikely(req->status & (ZFCP_STATUS_FSFREQ_DISMISSED |
|
|
|
|
ZFCP_STATUS_FSFREQ_ERROR))) {
|
|
|
|
zfcp_dbf_hba_fsf_resp("fs_rerr", 3, req);
|
|
|
|
|
|
|
|
} else if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) &&
|
2009-08-18 21:43:09 +08:00
|
|
|
(qtcb->prefix.prot_status != FSF_PROT_FSF_STATUS_PRESENTED)) {
|
2010-12-02 22:16:14 +08:00
|
|
|
zfcp_dbf_hba_fsf_resp("fs_perr", 1, req);
|
2009-08-18 21:43:09 +08:00
|
|
|
|
|
|
|
} else if (qtcb->header.fsf_status != FSF_GOOD) {
|
2016-12-10 00:16:32 +08:00
|
|
|
zfcp_dbf_hba_fsf_resp("fs_ferr",
|
|
|
|
zfcp_dbf_hba_fsf_resp_suppress(req)
|
|
|
|
? 5 : 1, req);
|
2009-08-18 21:43:09 +08:00
|
|
|
|
|
|
|
} else if ((req->fsf_command == FSF_QTCB_OPEN_PORT_WITH_DID) ||
|
|
|
|
(req->fsf_command == FSF_QTCB_OPEN_LUN)) {
|
2010-12-02 22:16:14 +08:00
|
|
|
zfcp_dbf_hba_fsf_resp("fs_open", 4, req);
|
2009-08-18 21:43:09 +08:00
|
|
|
|
|
|
|
} else if (qtcb->header.log_length) {
|
2010-12-02 22:16:14 +08:00
|
|
|
zfcp_dbf_hba_fsf_resp("fs_qtcb", 5, req);
|
2009-08-18 21:43:09 +08:00
|
|
|
|
|
|
|
} else {
|
2010-12-02 22:16:14 +08:00
|
|
|
zfcp_dbf_hba_fsf_resp("fs_norm", 6, req);
|
2009-08-18 21:43:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-08-18 21:43:08 +08:00
|
|
|
static inline
|
2010-12-02 22:16:15 +08:00
|
|
|
void _zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *scmd,
|
|
|
|
struct zfcp_fsf_req *req)
|
2009-08-18 21:43:08 +08:00
|
|
|
{
|
2010-12-02 22:16:15 +08:00
|
|
|
struct zfcp_adapter *adapter = (struct zfcp_adapter *)
|
|
|
|
scmd->device->host->hostdata[0];
|
|
|
|
|
2013-09-18 23:21:34 +08:00
|
|
|
if (debug_level_enabled(adapter->dbf->scsi, level))
|
2016-08-11 00:30:47 +08:00
|
|
|
zfcp_dbf_scsi(tag, level, scmd, req);
|
2009-08-18 21:43:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2009-08-18 21:43:21 +08:00
|
|
|
* zfcp_dbf_scsi_result - trace event for SCSI command completion
|
2009-08-18 21:43:08 +08:00
|
|
|
* @scmd: SCSI command pointer
|
2010-02-17 18:18:57 +08:00
|
|
|
* @req: FSF request used to issue SCSI command
|
2009-08-18 21:43:08 +08:00
|
|
|
*/
|
|
|
|
static inline
|
2010-12-02 22:16:15 +08:00
|
|
|
void zfcp_dbf_scsi_result(struct scsi_cmnd *scmd, struct zfcp_fsf_req *req)
|
2009-08-18 21:43:08 +08:00
|
|
|
{
|
2010-02-17 18:18:57 +08:00
|
|
|
if (scmd->result != 0)
|
2010-12-02 22:16:15 +08:00
|
|
|
_zfcp_dbf_scsi("rsl_err", 3, scmd, req);
|
2010-02-17 18:18:57 +08:00
|
|
|
else if (scmd->retries > 0)
|
2010-12-02 22:16:15 +08:00
|
|
|
_zfcp_dbf_scsi("rsl_ret", 4, scmd, req);
|
2010-02-17 18:18:57 +08:00
|
|
|
else
|
2010-12-02 22:16:15 +08:00
|
|
|
_zfcp_dbf_scsi("rsl_nor", 6, scmd, req);
|
2010-02-17 18:18:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * zfcp_dbf_scsi_fail_send - trace event for failure to send SCSI command
 * @scmd: SCSI command pointer
 *
 * Traced with tag "rsl_fai" at level 4 and without an FSF request.
 */
static inline
void zfcp_dbf_scsi_fail_send(struct scsi_cmnd *scmd)
{
	_zfcp_dbf_scsi("rsl_fai", 4, scmd, NULL);
}
|
|
|
|
|
|
|
|
/**
 * zfcp_dbf_scsi_abort - trace event for SCSI command abort
 * @tag: tag indicating success or failure of abort operation
 * @scmd: SCSI command to be aborted
 * @fsf_req: request containing abort (might be NULL)
 *
 * Traced at level 1.
 */
static inline
void zfcp_dbf_scsi_abort(char *tag, struct scsi_cmnd *scmd,
			 struct zfcp_fsf_req *fsf_req)
{
	_zfcp_dbf_scsi(tag, 1, scmd, fsf_req);
}
|
|
|
|
|
|
|
|
/**
|
2009-08-18 21:43:21 +08:00
|
|
|
* zfcp_dbf_scsi_devreset - trace event for Logical Unit or Target Reset
|
2009-08-18 21:43:08 +08:00
|
|
|
* @tag: tag indicating success or failure of reset operation
|
2010-09-08 20:39:55 +08:00
|
|
|
* @scmnd: SCSI command which caused this error recovery
|
2009-08-18 21:43:08 +08:00
|
|
|
* @flag: indicates type of reset (Target Reset, Logical Unit Reset)
|
|
|
|
*/
|
|
|
|
static inline
|
2017-07-28 18:30:54 +08:00
|
|
|
void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag,
|
|
|
|
struct zfcp_fsf_req *fsf_req)
|
2009-08-18 21:43:08 +08:00
|
|
|
{
|
2010-12-02 22:16:15 +08:00
|
|
|
char tmp_tag[ZFCP_DBF_TAG_LEN];
|
|
|
|
|
|
|
|
if (flag == FCP_TMF_TGT_RESET)
|
|
|
|
memcpy(tmp_tag, "tr_", 3);
|
|
|
|
else
|
|
|
|
memcpy(tmp_tag, "lr_", 3);
|
2010-09-08 20:39:55 +08:00
|
|
|
|
2010-12-02 22:16:15 +08:00
|
|
|
memcpy(&tmp_tag[3], tag, 4);
|
2017-07-28 18:30:54 +08:00
|
|
|
_zfcp_dbf_scsi(tmp_tag, 1, scmnd, fsf_req);
|
2009-08-18 21:43:08 +08:00
|
|
|
}
|
|
|
|
|
scsi: zfcp: fix use-after-"free" in FC ingress path after TMF
When SCSI EH invokes zFCP's callbacks for eh_device_reset_handler() and
eh_target_reset_handler(), it expects us to relent the ownership over
the given scsi_cmnd and all other scsi_cmnds within the same scope - LUN
or target - when returning with SUCCESS from the callback ('release'
them). SCSI EH can then reuse those commands.
We did not follow this rule to release commands upon SUCCESS; and if
later a reply arrived for one of those supposed to be released commands,
we would still make use of the scsi_cmnd in our ingress tasklet. This
will at least result in undefined behavior or a kernel panic because of
a wrong kernel pointer dereference.
To fix this, we NULLify all pointers to scsi_cmnds (struct zfcp_fsf_req
*)->data in the matching scope if a TMF was successful. This is done
under the locks (struct zfcp_adapter *)->abort_lock and (struct
zfcp_reqlist *)->lock to prevent the requests from being removed from
the request-hashtable, and the ingress tasklet from making use of the
scsi_cmnd-pointer in zfcp_fsf_fcp_cmnd_handler().
For cases where a reply arrives during SCSI EH, but before we get a
chance to NULLify the pointer - but before we return from the callback
-, we assume that the code is protected from races via the CAS operation
in blk_complete_request() that is called in scsi_done().
The following stacktrace shows an example for a crash resulting from the
previous behavior:
Unable to handle kernel pointer dereference at virtual kernel address fffffee17a672000
Oops: 0038 [#1] SMP
CPU: 2 PID: 0 Comm: swapper/2 Not tainted
task: 00000003f7ff5be0 ti: 00000003f3d38000 task.ti: 00000003f3d38000
Krnl PSW : 0404d00180000000 00000000001156b0 (smp_vcpu_scheduled+0x18/0x40)
R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3
Krnl GPRS: 000000200000007e 0000000000000000 fffffee17a671fd8 0000000300000015
ffffffff80000000 00000000005dfde8 07000003f7f80e00 000000004fa4e800
000000036ce8d8f8 000000036ce8d9c0 00000003ece8fe00 ffffffff969c9e93
00000003fffffffd 000000036ce8da10 00000000003bf134 00000003f3b07918
Krnl Code: 00000000001156a2: a7190000 lghi %r1,0
00000000001156a6: a7380015 lhi %r3,21
#00000000001156aa: e32050000008 ag %r2,0(%r5)
>00000000001156b0: 482022b0 lh %r2,688(%r2)
00000000001156b4: ae123000 sigp %r1,%r2,0(%r3)
00000000001156b8: b2220020 ipm %r2
00000000001156bc: 8820001c srl %r2,28
00000000001156c0: c02700000001 xilf %r2,1
Call Trace:
([<0000000000000000>] 0x0)
[<000003ff807bdb8e>] zfcp_fsf_fcp_cmnd_handler+0x3de/0x490 [zfcp]
[<000003ff807be30a>] zfcp_fsf_req_complete+0x252/0x800 [zfcp]
[<000003ff807c0a48>] zfcp_fsf_reqid_check+0xe8/0x190 [zfcp]
[<000003ff807c194e>] zfcp_qdio_int_resp+0x66/0x188 [zfcp]
[<000003ff80440c64>] qdio_kick_handler+0xdc/0x310 [qdio]
[<000003ff804463d0>] __tiqdio_inbound_processing+0xf8/0xcd8 [qdio]
[<0000000000141fd4>] tasklet_action+0x9c/0x170
[<0000000000141550>] __do_softirq+0xe8/0x258
[<000000000010ce0a>] do_softirq+0xba/0xc0
[<000000000014187c>] irq_exit+0xc4/0xe8
[<000000000046b526>] do_IRQ+0x146/0x1d8
[<00000000005d6a3c>] io_return+0x0/0x8
[<00000000005d6422>] vtime_stop_cpu+0x4a/0xa0
([<0000000000000000>] 0x0)
[<0000000000103d8a>] arch_cpu_idle+0xa2/0xb0
[<0000000000197f94>] cpu_startup_entry+0x13c/0x1f8
[<0000000000114782>] smp_start_secondary+0xda/0xe8
[<00000000005d6efe>] restart_int_handler+0x56/0x6c
[<0000000000000000>] 0x0
Last Breaking-Event-Address:
[<00000000003bf12e>] arch_spin_lock_wait+0x56/0xb0
Suggested-by: Steffen Maier <maier@linux.vnet.ibm.com>
Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Fixes: ea127f9754 ("[PATCH] s390 (7/7): zfcp host adapter.") (tglx/history.git)
Cc: <stable@vger.kernel.org> #2.6.32+
Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2016-12-10 00:16:31 +08:00
|
|
|
/**
 * zfcp_dbf_scsi_nullcmnd() - trace NULLify of SCSI command in dev/tgt-reset.
 * @scmnd: SCSI command that was NULLified.
 * @fsf_req: request that owned @scmnd.
 *
 * Traced with tag "scfc__1" at level 3.
 */
static inline void zfcp_dbf_scsi_nullcmnd(struct scsi_cmnd *scmnd,
					  struct zfcp_fsf_req *fsf_req)
{
	_zfcp_dbf_scsi("scfc__1", 3, scmnd, fsf_req);
}
|
|
|
|
|
2008-03-31 17:15:28 +08:00
|
|
|
#endif /* ZFCP_DBF_H */
|