drbd: fix hang on local read errors while disconnected
"Canceled" w_read_retry_remote work items never completed if they were canceled after the drbd_disconnect connection teardown cleanup had already run (or if we were not connected in the first place). Fixed by not queueing a remote retry if we already know it won't work (pdsk not uptodate), and by cleaning up ourselves on "cancel", in case we hit a race with drbd_disconnect. Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com> Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
This commit is contained in:
parent
32fa7e91f9
commit
d255e5ff5f
|
@ -452,20 +452,21 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
||||||
|
|
||||||
dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
|
dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
|
||||||
(unsigned long long)req->sector, req->size);
|
(unsigned long long)req->sector, req->size);
|
||||||
/* _req_mod(req,to_be_send); oops, recursion... */
|
|
||||||
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
|
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
|
||||||
req->rq_state |= RQ_NET_PENDING;
|
|
||||||
inc_ap_pending(mdev);
|
|
||||||
|
|
||||||
__drbd_chk_io_error(mdev, FALSE);
|
__drbd_chk_io_error(mdev, FALSE);
|
||||||
put_ldev(mdev);
|
put_ldev(mdev);
|
||||||
/* NOTE: if we have no connection,
|
|
||||||
* or know the peer has no good data either,
|
|
||||||
* then we don't actually need to "queue_for_net_read",
|
|
||||||
* but we do so anyways, since the drbd_io_error()
|
|
||||||
* and the potential state change to "Diskless"
|
|
||||||
* needs to be done from process context */
|
|
||||||
|
|
||||||
|
/* no point in retrying if there is no good remote data,
|
||||||
|
* or we have no connection. */
|
||||||
|
if (mdev->state.pdsk != D_UP_TO_DATE) {
|
||||||
|
_req_may_be_done(req, m);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* _req_mod(req,to_be_send); oops, recursion... */
|
||||||
|
req->rq_state |= RQ_NET_PENDING;
|
||||||
|
inc_ap_pending(mdev);
|
||||||
/* fall through: _req_mod(req,queue_for_net_read); */
|
/* fall through: _req_mod(req,queue_for_net_read); */
|
||||||
|
|
||||||
case queue_for_net_read:
|
case queue_for_net_read:
|
||||||
|
@ -575,6 +576,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
|
||||||
_req_may_be_done(req, m);
|
_req_may_be_done(req, m);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case read_retry_remote_canceled:
|
||||||
|
req->rq_state &= ~RQ_NET_QUEUED;
|
||||||
|
/* fall through, in case we raced with drbd_disconnect */
|
||||||
case connection_lost_while_pending:
|
case connection_lost_while_pending:
|
||||||
/* transfer log cleanup after connection loss */
|
/* transfer log cleanup after connection loss */
|
||||||
/* assert something? */
|
/* assert something? */
|
||||||
|
|
|
@ -91,6 +91,7 @@ enum drbd_req_event {
|
||||||
send_failed,
|
send_failed,
|
||||||
handed_over_to_network,
|
handed_over_to_network,
|
||||||
connection_lost_while_pending,
|
connection_lost_while_pending,
|
||||||
|
read_retry_remote_canceled,
|
||||||
recv_acked_by_peer,
|
recv_acked_by_peer,
|
||||||
write_acked_by_peer,
|
write_acked_by_peer,
|
||||||
write_acked_by_peer_and_sis, /* and set_in_sync */
|
write_acked_by_peer_and_sis, /* and set_in_sync */
|
||||||
|
|
|
@ -266,10 +266,8 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
|
||||||
* to give the disk the chance to relocate that block */
|
* to give the disk the chance to relocate that block */
|
||||||
|
|
||||||
spin_lock_irq(&mdev->req_lock);
|
spin_lock_irq(&mdev->req_lock);
|
||||||
if (cancel ||
|
if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
|
||||||
mdev->state.conn < C_CONNECTED ||
|
_req_mod(req, read_retry_remote_canceled);
|
||||||
mdev->state.pdsk <= D_INCONSISTENT) {
|
|
||||||
_req_mod(req, send_canceled);
|
|
||||||
spin_unlock_irq(&mdev->req_lock);
|
spin_unlock_irq(&mdev->req_lock);
|
||||||
dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
|
dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
|
||||||
return 1;
|
return 1;
|
||||||
|
|
Loading…
Reference in New Issue