linux-sg2042/drivers/s390/scsi/zfcp_dbf.h

434 lines
11 KiB
C
Raw Normal View History

/*
* zfcp device driver
* debug feature declarations
*
scsi: zfcp: fix passing fsf_req to SCSI trace on TMF to correlate with HBA Without this fix we get SCSI trace records on task management functions which cannot be correlated to HBA trace records because all fields related to the FSF request are empty (zero). Also, the FCP_RSP_IU is missing as well as any sense data if available. This was caused by v2.6.14 commit 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") introducing trace records for TMFs but hard coding NULL for a possibly existing TMF FSF request. The scsi_cmnd scribble is also zero or unrelated for the TMF request so it also could not lookup a suitable FSF request from there. A broken example trace record formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : SCSI Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : lr_fail Request ID : 0x0000000000000000 ^^^^^^^^^^^^^^^^ no correlation to HBA record SCSI ID : 0x<scsitarget> SCSI LUN : 0x<scsilun> SCSI result : 0x000e0000 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x0000000000000000 SCSI opcode : 2a000017 3bb80000 08000000 00000000 FCP rsp inf cod: 0x00 ^^ no TMF response FCP rsp IU : 00000000 00000000 00000000 00000000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 00000000 00000000 ^^^^^^^^^^^^^^^^^ no interesting FCP_RSP_IU Sense len : ... ^^^^^^^^^^^^^^^^^^^^ no sense data length Sense info : ... ^^^^^^^^^^^^^^^^^^^^ no sense data content, even if present There are some true cases where we really do not have an FSF request: "rsl_fai" from zfcp_dbf_scsi_fail_send() called for early returns / completions in zfcp_scsi_queuecommand(), "abrt_or", "abrt_bl", "abrt_ru", "abrt_ar" from zfcp_scsi_eh_abort_handler() where we did not get as far, "lr_nres", "tr_nres" from zfcp_task_mgmt_function() where we're successful and do not need to do anything because adapter stopped. For these cases it's correct to pass NULL for fsf_req to _zfcp_dbf_scsi(). Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Cc: <stable@vger.kernel.org> #2.6.38+ Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2017-07-28 18:30:54 +08:00
* Copyright IBM Corp. 2008, 2017
*/
#ifndef ZFCP_DBF_H
#define ZFCP_DBF_H
#include <scsi/fc/fc_fcp.h>
#include "zfcp_ext.h"
#include "zfcp_fsf.h"
#include "zfcp_def.h"
#define ZFCP_DBF_TAG_LEN 7
#define ZFCP_DBF_INVALID_LUN 0xFFFFFFFFFFFFFFFFull
zfcp: close window with unblocked rport during rport gone On a successful end of reopen port forced, zfcp_erp_strategy_followup_success() re-uses the port erp_action and the subsequent zfcp_erp_action_cleanup() now sees ZFCP_ERP_SUCCEEDED with erp_action->action==ZFCP_ERP_ACTION_REOPEN_PORT instead of ZFCP_ERP_ACTION_REOPEN_PORT_FORCED but must not perform zfcp_scsi_schedule_rport_register(). We can detect this because the fresh port reopen erp_action is in its very first step ZFCP_ERP_STEP_UNINITIALIZED. Otherwise this opens a time window with unblocked rport (until the followup port reopen recovery would block it again). If a scsi_cmnd timeout occurs during this time window fc_timed_out() cannot work as desired and such command would indeed time out and trigger scsi_eh. This prevents a clean and timely path failover. This should not happen if the path issue can be recovered on FC transport layer such as path issues involving RSCNs. Also, unnecessary and repeated DID_IMM_RETRY for pending and undesired new requests occur because internally zfcp still has its zfcp_port blocked. As follow-on errors with scsi_eh, it can cause, in the worst case, permanently lost paths due to one of: sd <scsidev>: [<scsidisk>] Medium access timeout failure. Offlining disk! sd <scsidev>: Device offlined - not ready after error recovery For fix validation and to aid future debugging with other recoveries we now also trace (un)blocking of rports. Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Fixes: 5767620c383a ("[SCSI] zfcp: Do not unblock rport from REOPEN_PORT_FORCED") Fixes: a2fa0aede07c ("[SCSI] zfcp: Block FC transport rports early on errors") Fixes: 5f852be9e11d ("[SCSI] zfcp: Fix deadlock between zfcp ERP and SCSI") Fixes: 338151e06608 ("[SCSI] zfcp: make use of fc_remote_port_delete when target port is unavailable") Fixes: 3859f6a248cb ("[PATCH] zfcp: add rports to enable scsi_add_device to work again") Cc: <stable@vger.kernel.org> #2.6.32+ Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com> Reviewed-by: Hannes Reinecke <hare@suse.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2016-08-11 00:30:46 +08:00
enum zfcp_dbf_pseudo_erp_act_type {
ZFCP_PSEUDO_ERP_ACTION_RPORT_ADD = 0xff,
ZFCP_PSEUDO_ERP_ACTION_RPORT_DEL = 0xfe,
};
/**
* struct zfcp_dbf_rec_trigger - trace record for triggered recovery action
* @ready: number of ready recovery actions
* @running: number of running recovery actions
* @want: wanted recovery action
* @need: needed recovery action
*/
struct zfcp_dbf_rec_trigger {
u32 ready;
u32 running;
u8 want;
u8 need;
} __packed;
/**
* struct zfcp_dbf_rec_running - trace record for running recovery
* @fsf_req_id: request id for fsf requests
* @rec_status: status of the fsf request
* @rec_step: current step of the recovery action
* rec_count: recovery counter
*/
struct zfcp_dbf_rec_running {
u64 fsf_req_id;
u32 rec_status;
u16 rec_step;
u8 rec_action;
u8 rec_count;
} __packed;
/**
* enum zfcp_dbf_rec_id - recovery trace record id
* @ZFCP_DBF_REC_TRIG: triggered recovery identifier
* @ZFCP_DBF_REC_RUN: running recovery identifier
*/
enum zfcp_dbf_rec_id {
ZFCP_DBF_REC_TRIG = 1,
ZFCP_DBF_REC_RUN = 2,
};
/**
* struct zfcp_dbf_rec - trace record for error recovery actions
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @lun: logical unit number
* @wwpn: word wide port number
* @d_id: destination ID
* @adapter_status: current status of the adapter
* @port_status: current status of the port
* @lun_status: current status of the lun
* @u.trig: structure zfcp_dbf_rec_trigger
* @u.run: structure zfcp_dbf_rec_running
*/
struct zfcp_dbf_rec {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u64 lun;
u64 wwpn;
u32 d_id;
u32 adapter_status;
u32 port_status;
u32 lun_status;
union {
struct zfcp_dbf_rec_trigger trig;
struct zfcp_dbf_rec_running run;
} u;
} __packed;
/**
* enum zfcp_dbf_san_id - SAN trace record identifier
* @ZFCP_DBF_SAN_REQ: request trace record id
* @ZFCP_DBF_SAN_RES: response trace record id
* @ZFCP_DBF_SAN_ELS: extended link service record id
*/
enum zfcp_dbf_san_id {
ZFCP_DBF_SAN_REQ = 1,
ZFCP_DBF_SAN_RES = 2,
ZFCP_DBF_SAN_ELS = 3,
};
/** struct zfcp_dbf_san - trace record for SAN requests and responses
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @fsf_req_id: request id for fsf requests
* @payload: unformatted information related to request/response
* @d_id: destination id
*/
struct zfcp_dbf_san {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u64 fsf_req_id;
u32 d_id;
#define ZFCP_DBF_SAN_MAX_PAYLOAD (FC_CT_HDR_LEN + 32)
char payload[ZFCP_DBF_SAN_MAX_PAYLOAD];
u16 pl_len;
} __packed;
/**
* struct zfcp_dbf_hba_res - trace record for hba responses
* @req_issued: timestamp when request was issued
* @prot_status: protocol status
* @prot_status_qual: protocol status qualifier
* @fsf_status: fsf status
* @fsf_status_qual: fsf status qualifier
*/
struct zfcp_dbf_hba_res {
u64 req_issued;
u32 prot_status;
u8 prot_status_qual[FSF_PROT_STATUS_QUAL_SIZE];
u32 fsf_status;
u8 fsf_status_qual[FSF_STATUS_QUALIFIER_SIZE];
u32 port_handle;
u32 lun_handle;
} __packed;
/**
* struct zfcp_dbf_hba_uss - trace record for unsolicited status
* @status_type: type of unsolicited status
* @status_subtype: subtype of unsolicited status
* @d_id: destination ID
* @lun: logical unit number
* @queue_designator: queue designator
*/
struct zfcp_dbf_hba_uss {
u32 status_type;
u32 status_subtype;
u32 d_id;
u64 lun;
u64 queue_designator;
} __packed;
/**
* enum zfcp_dbf_hba_id - HBA trace record identifier
* @ZFCP_DBF_HBA_RES: response trace record
* @ZFCP_DBF_HBA_USS: unsolicited status trace record
* @ZFCP_DBF_HBA_BIT: bit error trace record
*/
enum zfcp_dbf_hba_id {
ZFCP_DBF_HBA_RES = 1,
ZFCP_DBF_HBA_USS = 2,
ZFCP_DBF_HBA_BIT = 3,
[SCSI] zfcp: Do not wakeup while suspended If the mapping of FCP device bus ID and corresponding subchannel is modified while the Linux image is suspended, the resume of FCP devices can fail. During resume, zfcp gets callbacks from cio regarding the modified subchannels but they can be arbitrarily mixed with the restore/resume callback. Since the cio callbacks would trigger adapter recovery, zfcp could wakeup before the resume callback. Therefore, ignore the cio callbacks regarding subchannels while being suspended. We can safely do so, since zfcp does not deal itself with subchannels. For problem determination purposes, we still trace the ignored callback events. The following kernel messages could be seen on resume: kernel: <WWPN>: parent <FCP device bus ID> should not be sleeping As part of adapter reopen recovery, zfcp performs auto port scanning which can erroneously try to register new remote ports with scsi_transport_fc and the device core code complains about the parent (adapter) still sleeping. kernel: zfcp.3dff9c: <FCP device bus ID>:\ Setting up the QDIO connection to the FCP adapter failed <last kernel message repeated 3 more times> kernel: zfcp.574d43: <FCP device bus ID>:\ ERP cannot recover an error on the FCP device In such cases, the adapter gave up recovery and remained blocked along with its child objects: remote ports and LUNs/scsi devices. Even the adapter shutdown as part of giving up recovery failed because the ccw device state remained disconnected. Later, the corresponding remote ports ran into dev_loss_tmo. As a result, the LUNs were erroneously not available again after resume. Even a manually triggered adapter recovery (e.g. sysfs attribute failed, or device offline/online via sysfs) could not recover the adapter due to the remaining disconnected state of the corresponding ccw device. Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Cc: <stable@vger.kernel.org> #2.6.32+ Signed-off-by: James Bottomley <JBottomley@Parallels.com>
2012-09-04 21:23:32 +08:00
ZFCP_DBF_HBA_BASIC = 4,
};
/**
* struct zfcp_dbf_hba - common trace record for HBA records
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @fsf_req_id: request id for fsf requests
* @fsf_req_status: status of fsf request
* @fsf_cmd: fsf command
* @fsf_seq_no: fsf sequence number
* @pl_len: length of payload stored as zfcp_dbf_pay
* @u: record type specific data
*/
struct zfcp_dbf_hba {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u64 fsf_req_id;
u32 fsf_req_status;
u32 fsf_cmd;
u32 fsf_seq_no;
u16 pl_len;
union {
struct zfcp_dbf_hba_res res;
struct zfcp_dbf_hba_uss uss;
struct fsf_bit_error_payload be;
} u;
} __packed;
/**
* enum zfcp_dbf_scsi_id - scsi trace record identifier
* @ZFCP_DBF_SCSI_CMND: scsi command trace record
*/
enum zfcp_dbf_scsi_id {
ZFCP_DBF_SCSI_CMND = 1,
};
/**
* struct zfcp_dbf_scsi - common trace record for SCSI records
* @id: unique number of recovery record type
* @tag: identifier string specifying the location of initiation
* @scsi_id: scsi device id
* @scsi_lun: scsi device logical unit number
* @scsi_result: scsi result
* @scsi_retries: current retry number of scsi request
* @scsi_allowed: allowed retries
* @fcp_rsp_info: FCP response info
* @scsi_opcode: scsi opcode
* @fsf_req_id: request id of fsf request
* @host_scribble: LLD specific data attached to SCSI request
* @pl_len: length of paload stored as zfcp_dbf_pay
* @fsf_rsp: response for fsf request
*/
struct zfcp_dbf_scsi {
u8 id;
char tag[ZFCP_DBF_TAG_LEN];
u32 scsi_id;
u32 scsi_lun;
u32 scsi_result;
u8 scsi_retries;
u8 scsi_allowed;
u8 fcp_rsp_info;
#define ZFCP_DBF_SCSI_OPCODE 16
u8 scsi_opcode[ZFCP_DBF_SCSI_OPCODE];
u64 fsf_req_id;
u64 host_scribble;
u16 pl_len;
struct fcp_resp_with_ext fcp_rsp;
} __packed;
/**
* struct zfcp_dbf_pay - trace record for unformatted payload information
* @area: area this record is originated from
* @counter: ascending record number
* @fsf_req_id: request id of fsf request
* @data: unformatted data
*/
struct zfcp_dbf_pay {
u8 counter;
char area[ZFCP_DBF_TAG_LEN];
u64 fsf_req_id;
#define ZFCP_DBF_PAY_MAX_REC 0x100
char data[ZFCP_DBF_PAY_MAX_REC];
} __packed;
/**
* struct zfcp_dbf - main dbf trace structure
* @pay: reference to payload trace area
* @rec: reference to recovery trace area
* @hba: reference to hba trace area
* @san: reference to san trace area
* @scsi: reference to scsi trace area
* @pay_lock: lock protecting payload trace buffer
* @rec_lock: lock protecting recovery trace buffer
* @hba_lock: lock protecting hba trace buffer
* @san_lock: lock protecting san trace buffer
* @scsi_lock: lock protecting scsi trace buffer
* @pay_buf: pre-allocated buffer for payload
* @rec_buf: pre-allocated buffer for recovery
* @hba_buf: pre-allocated buffer for hba
* @san_buf: pre-allocated buffer for san
* @scsi_buf: pre-allocated buffer for scsi
*/
struct zfcp_dbf {
debug_info_t *pay;
debug_info_t *rec;
debug_info_t *hba;
debug_info_t *san;
debug_info_t *scsi;
spinlock_t pay_lock;
spinlock_t rec_lock;
spinlock_t hba_lock;
spinlock_t san_lock;
spinlock_t scsi_lock;
struct zfcp_dbf_pay pay_buf;
struct zfcp_dbf_rec rec_buf;
struct zfcp_dbf_hba hba_buf;
struct zfcp_dbf_san san_buf;
struct zfcp_dbf_scsi scsi_buf;
};
/**
* zfcp_dbf_hba_fsf_resp_suppress - true if we should not trace by default
* @req: request that has been completed
*
* Returns true if FCP response with only benign residual under count.
*/
static inline
bool zfcp_dbf_hba_fsf_resp_suppress(struct zfcp_fsf_req *req)
{
struct fsf_qtcb *qtcb = req->qtcb;
u32 fsf_stat = qtcb->header.fsf_status;
struct fcp_resp *fcp_rsp;
u8 rsp_flags, fr_status;
if (qtcb->prefix.qtcb_type != FSF_IO_COMMAND)
return false; /* not an FCP response */
fcp_rsp = (struct fcp_resp *)&qtcb->bottom.io.fcp_rsp;
rsp_flags = fcp_rsp->fr_flags;
fr_status = fcp_rsp->fr_status;
return (fsf_stat == FSF_FCP_RSP_AVAILABLE) &&
(rsp_flags == FCP_RESID_UNDER) &&
(fr_status == SAM_STAT_GOOD);
}
static inline
void zfcp_dbf_hba_fsf_resp(char *tag, int level, struct zfcp_fsf_req *req)
{
if (debug_level_enabled(req->adapter->dbf->hba, level))
zfcp_dbf_hba_fsf_res(tag, level, req);
}
/**
* zfcp_dbf_hba_fsf_response - trace event for request completion
* @req: request that has been completed
*/
static inline
void zfcp_dbf_hba_fsf_response(struct zfcp_fsf_req *req)
{
struct fsf_qtcb *qtcb = req->qtcb;
scsi: zfcp: trace HBA FSF response by default on dismiss or timedout late response At the default trace level, we only trace unsuccessful events including FSF responses. zfcp_dbf_hba_fsf_response() only used protocol status and FSF status to decide on an unsuccessful response. However, this is only one of multiple possible sources determining a failed struct zfcp_fsf_req. An FSF request can also "fail" if its response runs into an ERP timeout or if it gets dismissed because a higher level recovery was triggered [trace tags "erscf_1" or "erscf_2" in zfcp_erp_strategy_check_fsfreq()]. FSF requests with ERP timeout are: FSF_QTCB_EXCHANGE_CONFIG_DATA, FSF_QTCB_EXCHANGE_PORT_DATA, FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT or FSF_QTCB_CLOSE_PHYSICAL_PORT for target ports, FSF_QTCB_OPEN_LUN, FSF_QTCB_CLOSE_LUN. One example is slow queue processing which can cause follow-on errors, e.g. FSF_PORT_ALREADY_OPEN after FSF_QTCB_OPEN_PORT_WITH_DID timed out. In order to see the root cause, we need to see late responses even if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. Example trace records formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : fcegpf1 LUN : 0xffffffffffffffff WWPN : 0x<WWPN> D_ID : 0x00<D_ID> Adapter status : 0x5400050b Port status : 0x41200000 LUN status : 0x00000000 Ready count : 0x00000001 Running count : 0x... ERP want : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT ERP need : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT | Timestamp : ... 30 seconds later Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : ... Record ID : 2 Tag : erscf_2 LUN : 0xffffffffffffffff WWPN : 0x<WWPN> D_ID : 0x00<D_ID> Adapter status : 0x5400050b Port status : 0x41200000 LUN status : 0x00000000 Request ID : 0x<request_ID> ERP status : 0x10000000 ZFCP_STATUS_ERP_TIMEDOUT ERP step : 0x0800 ZFCP_ERP_STEP_PORT_OPENING ERP action : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT ERP count : 0x00 | Timestamp : ... later than previous record Area : HBA Subarea : 00 Level : 5 > default level => 3 <= default level Exception : - CPU ID : 00 Caller : ... Record ID : 1 Tag : fs_qtcb => fs_rerr Request ID : 0x<request_ID> Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED | ZFCP_STATUS_FSFREQ_CLEANUP FSF cmnd : 0x00000005 FSF sequence no: 0x... FSF issued : ... > 30 seconds ago FSF stat : 0x00000000 FSF_GOOD FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 FSF_PROT_GOOD Prot stat qual : 00000000 00000000 00000000 00000000 Port handle : 0x... LUN handle : 0x00000000 QTCB log length: ... QTCB log info : ... In case of problems detecting that new responses are waiting on the input queue, we sooner or later trigger adapter recovery due to an FSF request timeout (trace tag "fsrth_1"). FSF requests with FSF request timeout are: typically FSF_QTCB_ABORT_FCP_CMND; but theoretically also FSF_QTCB_EXCHANGE_CONFIG_DATA or FSF_QTCB_EXCHANGE_PORT_DATA via sysfs, FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT for WKA ports, FSF_QTCB_FCP_CMND for task management function (LUN / target reset). One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD because the channel filled in the response via DMA into the request's QTCB. In a theroretical case, inject code can create an erroneous FSF request on purpose. If data router is enabled, it uses deferred error reporting. A READ SCSI command can succeed with FSF_PROT_GOOD, FSF_GOOD, and SAM_STAT_GOOD. But on writing the read data to host memory via DMA, it can still fail, e.g. if an intentionally wrong scatter list does not provide enough space. Rather than getting an unsuccessful response, we get a QDIO activate check which in turn triggers adapter recovery. One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD because the channel filled in the response via DMA into the request's QTCB. Example trace records formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : HBA Subarea : 00 Level : 6 > default level => 3 <= default level Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : fs_norm => fs_rerr Request ID : 0x<request_ID2> Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED | ZFCP_STATUS_FSFREQ_CLEANUP FSF cmnd : 0x00000001 FSF sequence no: 0x... FSF issued : ... FSF stat : 0x00000000 FSF_GOOD FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 FSF_PROT_GOOD Prot stat qual : ........ ........ 00000000 00000000 Port handle : 0x... LUN handle : 0x... | Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : rsl_err Request ID : 0x<request_ID2> SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x000e0000 DID_TRANSPORT_DISRUPTED SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x<request_ID2> SCSI opcode : 28... Read(10) FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000000 00000000 ^^ SAM_STAT_GOOD 00000000 00000000 Only with luck in both above cases, we could see a follow-on trace record of an unsuccesful event following a successful but late FSF response with FSF_PROT_GOOD and FSF_GOOD. Typically this was the case for I/O requests resulting in a SCSI trace record "rsl_err" with DID_TRANSPORT_DISRUPTED [On ZFCP_STATUS_FSFREQ_DISMISSED, zfcp_fsf_protstatus_eval() sets ZFCP_STATUS_FSFREQ_ERROR seen by the request handler functions as failure]. However, the reason for this follow-on trace was invisible because the corresponding HBA trace record was missing at the default trace level (by default hidden records with tags "fs_norm", "fs_qtcb", or "fs_open"). On adapter recovery, after we had shut down the QDIO queues, we perform unsuccessful pseudo completions with flag ZFCP_STATUS_FSFREQ_DISMISSED for each pending FSF request in zfcp_fsf_req_dismiss_all(). In order to find the root cause, we need to see all pseudo responses even if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. Therefore, check zfcp_fsf_req.status for ZFCP_STATUS_FSFREQ_DISMISSED or ZFCP_STATUS_FSFREQ_ERROR and trace with a new tag "fs_rerr". It does not matter that there are numerous places which set ZFCP_STATUS_FSFREQ_ERROR after the location where we trace an FSF response early. These cases are based on protocol status != FSF_PROT_GOOD or == FSF_PROT_FSF_STATUS_PRESENTED and are thus already traced by default as trace tag "fs_perr" or "fs_ferr" respectively. NB: The trace record with tag "fssrh_1" for status read buffers on dismiss all remains. zfcp_fsf_req_complete() handles this and returns early. All other FSF request types are handled separately and as described above. Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Fixes: 2e261af84cdb ("[SCSI] zfcp: Only collect FSF/HBA debug data for matching trace levels") Cc: <stable@vger.kernel.org> #2.6.38+ Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2017-07-28 18:30:57 +08:00
if (unlikely(req->status & (ZFCP_STATUS_FSFREQ_DISMISSED |
ZFCP_STATUS_FSFREQ_ERROR))) {
zfcp_dbf_hba_fsf_resp("fs_rerr", 3, req);
} else if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) &&
(qtcb->prefix.prot_status != FSF_PROT_FSF_STATUS_PRESENTED)) {
zfcp_dbf_hba_fsf_resp("fs_perr", 1, req);
} else if (qtcb->header.fsf_status != FSF_GOOD) {
zfcp_dbf_hba_fsf_resp("fs_ferr",
zfcp_dbf_hba_fsf_resp_suppress(req)
? 5 : 1, req);
} else if ((req->fsf_command == FSF_QTCB_OPEN_PORT_WITH_DID) ||
(req->fsf_command == FSF_QTCB_OPEN_LUN)) {
zfcp_dbf_hba_fsf_resp("fs_open", 4, req);
} else if (qtcb->header.log_length) {
zfcp_dbf_hba_fsf_resp("fs_qtcb", 5, req);
} else {
zfcp_dbf_hba_fsf_resp("fs_norm", 6, req);
}
}
static inline
void _zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *scmd,
struct zfcp_fsf_req *req)
{
struct zfcp_adapter *adapter = (struct zfcp_adapter *)
scmd->device->host->hostdata[0];
if (debug_level_enabled(adapter->dbf->scsi, level))
zfcp_dbf_scsi(tag, level, scmd, req);
}
/**
* zfcp_dbf_scsi_result - trace event for SCSI command completion
* @scmd: SCSI command pointer
* @req: FSF request used to issue SCSI command
*/
static inline
void zfcp_dbf_scsi_result(struct scsi_cmnd *scmd, struct zfcp_fsf_req *req)
{
if (scmd->result != 0)
_zfcp_dbf_scsi("rsl_err", 3, scmd, req);
else if (scmd->retries > 0)
_zfcp_dbf_scsi("rsl_ret", 4, scmd, req);
else
_zfcp_dbf_scsi("rsl_nor", 6, scmd, req);
}
/**
* zfcp_dbf_scsi_fail_send - trace event for failure to send SCSI command
* @scmd: SCSI command pointer
*/
static inline
void zfcp_dbf_scsi_fail_send(struct scsi_cmnd *scmd)
{
_zfcp_dbf_scsi("rsl_fai", 4, scmd, NULL);
}
/**
* zfcp_dbf_scsi_abort - trace event for SCSI command abort
* @tag: tag indicating success or failure of abort operation
* @scmd: SCSI command to be aborted
* @fsf_req: request containing abort (might be NULL)
*/
static inline
void zfcp_dbf_scsi_abort(char *tag, struct scsi_cmnd *scmd,
struct zfcp_fsf_req *fsf_req)
{
_zfcp_dbf_scsi(tag, 1, scmd, fsf_req);
}
/**
* zfcp_dbf_scsi_devreset - trace event for Logical Unit or Target Reset
* @tag: tag indicating success or failure of reset operation
* @scmnd: SCSI command which caused this error recovery
* @flag: indicates type of reset (Target Reset, Logical Unit Reset)
*/
static inline
scsi: zfcp: fix passing fsf_req to SCSI trace on TMF to correlate with HBA Without this fix we get SCSI trace records on task management functions which cannot be correlated to HBA trace records because all fields related to the FSF request are empty (zero). Also, the FCP_RSP_IU is missing as well as any sense data if available. This was caused by v2.6.14 commit 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") introducing trace records for TMFs but hard coding NULL for a possibly existing TMF FSF request. The scsi_cmnd scribble is also zero or unrelated for the TMF request so it also could not lookup a suitable FSF request from there. A broken example trace record formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : SCSI Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : lr_fail Request ID : 0x0000000000000000 ^^^^^^^^^^^^^^^^ no correlation to HBA record SCSI ID : 0x<scsitarget> SCSI LUN : 0x<scsilun> SCSI result : 0x000e0000 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x0000000000000000 SCSI opcode : 2a000017 3bb80000 08000000 00000000 FCP rsp inf cod: 0x00 ^^ no TMF response FCP rsp IU : 00000000 00000000 00000000 00000000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 00000000 00000000 ^^^^^^^^^^^^^^^^^ no interesting FCP_RSP_IU Sense len : ... ^^^^^^^^^^^^^^^^^^^^ no sense data length Sense info : ... ^^^^^^^^^^^^^^^^^^^^ no sense data content, even if present There are some true cases where we really do not have an FSF request: "rsl_fai" from zfcp_dbf_scsi_fail_send() called for early returns / completions in zfcp_scsi_queuecommand(), "abrt_or", "abrt_bl", "abrt_ru", "abrt_ar" from zfcp_scsi_eh_abort_handler() where we did not get as far, "lr_nres", "tr_nres" from zfcp_task_mgmt_function() where we're successful and do not need to do anything because adapter stopped. For these cases it's correct to pass NULL for fsf_req to _zfcp_dbf_scsi(). Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Cc: <stable@vger.kernel.org> #2.6.38+ Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2017-07-28 18:30:54 +08:00
void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag,
struct zfcp_fsf_req *fsf_req)
{
char tmp_tag[ZFCP_DBF_TAG_LEN];
if (flag == FCP_TMF_TGT_RESET)
memcpy(tmp_tag, "tr_", 3);
else
memcpy(tmp_tag, "lr_", 3);
memcpy(&tmp_tag[3], tag, 4);
scsi: zfcp: fix passing fsf_req to SCSI trace on TMF to correlate with HBA Without this fix we get SCSI trace records on task management functions which cannot be correlated to HBA trace records because all fields related to the FSF request are empty (zero). Also, the FCP_RSP_IU is missing as well as any sense data if available. This was caused by v2.6.14 commit 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") introducing trace records for TMFs but hard coding NULL for a possibly existing TMF FSF request. The scsi_cmnd scribble is also zero or unrelated for the TMF request so it also could not lookup a suitable FSF request from there. A broken example trace record formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : SCSI Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : lr_fail Request ID : 0x0000000000000000 ^^^^^^^^^^^^^^^^ no correlation to HBA record SCSI ID : 0x<scsitarget> SCSI LUN : 0x<scsilun> SCSI result : 0x000e0000 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x0000000000000000 SCSI opcode : 2a000017 3bb80000 08000000 00000000 FCP rsp inf cod: 0x00 ^^ no TMF response FCP rsp IU : 00000000 00000000 00000000 00000000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 00000000 00000000 ^^^^^^^^^^^^^^^^^ no interesting FCP_RSP_IU Sense len : ... ^^^^^^^^^^^^^^^^^^^^ no sense data length Sense info : ... ^^^^^^^^^^^^^^^^^^^^ no sense data content, even if present There are some true cases where we really do not have an FSF request: "rsl_fai" from zfcp_dbf_scsi_fail_send() called for early returns / completions in zfcp_scsi_queuecommand(), "abrt_or", "abrt_bl", "abrt_ru", "abrt_ar" from zfcp_scsi_eh_abort_handler() where we did not get as far, "lr_nres", "tr_nres" from zfcp_task_mgmt_function() where we're successful and do not need to do anything because adapter stopped. For these cases it's correct to pass NULL for fsf_req to _zfcp_dbf_scsi(). Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Cc: <stable@vger.kernel.org> #2.6.38+ Reviewed-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2017-07-28 18:30:54 +08:00
_zfcp_dbf_scsi(tmp_tag, 1, scmnd, fsf_req);
}
scsi: zfcp: fix use-after-"free" in FC ingress path after TMF When SCSI EH invokes zFCP's callbacks for eh_device_reset_handler() and eh_target_reset_handler(), it expects us to relent the ownership over the given scsi_cmnd and all other scsi_cmnds within the same scope - LUN or target - when returning with SUCCESS from the callback ('release' them). SCSI EH can then reuse those commands. We did not follow this rule to release commands upon SUCCESS; and if later a reply arrived for one of those supposed to be released commands, we would still make use of the scsi_cmnd in our ingress tasklet. This will at least result in undefined behavior or a kernel panic because of a wrong kernel pointer dereference. To fix this, we NULLify all pointers to scsi_cmnds (struct zfcp_fsf_req *)->data in the matching scope if a TMF was successful. This is done under the locks (struct zfcp_adapter *)->abort_lock and (struct zfcp_reqlist *)->lock to prevent the requests from being removed from the request-hashtable, and the ingress tasklet from making use of the scsi_cmnd-pointer in zfcp_fsf_fcp_cmnd_handler(). For cases where a reply arrives during SCSI EH, but before we get a chance to NULLify the pointer - but before we return from the callback -, we assume that the code is protected from races via the CAS operation in blk_complete_request() that is called in scsi_done(). The following stacktrace shows an example for a crash resulting from the previous behavior: Unable to handle kernel pointer dereference at virtual kernel address fffffee17a672000 Oops: 0038 [#1] SMP CPU: 2 PID: 0 Comm: swapper/2 Not tainted task: 00000003f7ff5be0 ti: 00000003f3d38000 task.ti: 00000003f3d38000 Krnl PSW : 0404d00180000000 00000000001156b0 (smp_vcpu_scheduled+0x18/0x40) R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3 Krnl GPRS: 000000200000007e 0000000000000000 fffffee17a671fd8 0000000300000015 ffffffff80000000 00000000005dfde8 07000003f7f80e00 000000004fa4e800 000000036ce8d8f8 000000036ce8d9c0 00000003ece8fe00 ffffffff969c9e93 00000003fffffffd 000000036ce8da10 00000000003bf134 00000003f3b07918 Krnl Code: 00000000001156a2: a7190000 lghi %r1,0 00000000001156a6: a7380015 lhi %r3,21 #00000000001156aa: e32050000008 ag %r2,0(%r5) >00000000001156b0: 482022b0 lh %r2,688(%r2) 00000000001156b4: ae123000 sigp %r1,%r2,0(%r3) 00000000001156b8: b2220020 ipm %r2 00000000001156bc: 8820001c srl %r2,28 00000000001156c0: c02700000001 xilf %r2,1 Call Trace: ([<0000000000000000>] 0x0) [<000003ff807bdb8e>] zfcp_fsf_fcp_cmnd_handler+0x3de/0x490 [zfcp] [<000003ff807be30a>] zfcp_fsf_req_complete+0x252/0x800 [zfcp] [<000003ff807c0a48>] zfcp_fsf_reqid_check+0xe8/0x190 [zfcp] [<000003ff807c194e>] zfcp_qdio_int_resp+0x66/0x188 [zfcp] [<000003ff80440c64>] qdio_kick_handler+0xdc/0x310 [qdio] [<000003ff804463d0>] __tiqdio_inbound_processing+0xf8/0xcd8 [qdio] [<0000000000141fd4>] tasklet_action+0x9c/0x170 [<0000000000141550>] __do_softirq+0xe8/0x258 [<000000000010ce0a>] do_softirq+0xba/0xc0 [<000000000014187c>] irq_exit+0xc4/0xe8 [<000000000046b526>] do_IRQ+0x146/0x1d8 [<00000000005d6a3c>] io_return+0x0/0x8 [<00000000005d6422>] vtime_stop_cpu+0x4a/0xa0 ([<0000000000000000>] 0x0) [<0000000000103d8a>] arch_cpu_idle+0xa2/0xb0 [<0000000000197f94>] cpu_startup_entry+0x13c/0x1f8 [<0000000000114782>] smp_start_secondary+0xda/0xe8 [<00000000005d6efe>] restart_int_handler+0x56/0x6c [<0000000000000000>] 0x0 Last Breaking-Event-Address: [<00000000003bf12e>] arch_spin_lock_wait+0x56/0xb0 Suggested-by: Steffen Maier <maier@linux.vnet.ibm.com> Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com> Fixes: ea127f9754 ("[PATCH] s390 (7/7): zfcp host adapter.") (tglx/history.git) Cc: <stable@vger.kernel.org> #2.6.32+ Signed-off-by: Steffen Maier <maier@linux.vnet.ibm.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2016-12-10 00:16:31 +08:00
/**
* zfcp_dbf_scsi_nullcmnd() - trace NULLify of SCSI command in dev/tgt-reset.
* @scmnd: SCSI command that was NULLified.
* @fsf_req: request that owned @scmnd.
*/
static inline void zfcp_dbf_scsi_nullcmnd(struct scsi_cmnd *scmnd,
struct zfcp_fsf_req *fsf_req)
{
_zfcp_dbf_scsi("scfc__1", 3, scmnd, fsf_req);
}
#endif /* ZFCP_DBF_H */