From fa14a0acea1ffe67913ba384a2897130a36dfe03 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 1 Nov 2016 18:36:46 +0200 Subject: [PATCH 1/6] nvmet-rdma: Fix possible NULL deref when handling rdma cm events When we initiate queue teardown sequence we call rdma_destroy_qp which clears cm_id->qp, afterwards we call rdma_destroy_id, but we might see a rdma_cm event in between with a cleared cm_id->qp so watch out for that and silently ignore the event because this means that the queue teardown sequence is in progress. Signed-off-by: Bart Van Assche Signed-off-by: Sagi Grimberg --- drivers/nvme/target/rdma.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index f8d23999e0f2..cf60759cc169 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1352,7 +1352,13 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: - nvmet_rdma_queue_disconnect(queue); + /* + * We might end up here when we already freed the qp + * which means queue release sequence is in progress, + * so don't get in the way... + */ + if (queue) + nvmet_rdma_queue_disconnect(queue); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: ret = nvmet_rdma_device_removal(cm_id, queue); From 553cd9ef82edd811948782a8f73ae73c4bfeedd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Nov 2016 08:49:18 -0600 Subject: [PATCH 2/6] nvme-rdma: reject non-connect commands before the queue is live If we reconncect we might have command queue up that get resent as soon as the queue is restarted. But until the connect command succeeded we can't send other command. Add a new flag that marks a queue as live when connect finishes, and delay any non-connect command until the queue is live based on it. Signed-off-by: Christoph Hellwig Reported-by: Steve Wise Tested-by: Steve Wise [sagig: fixes admin queue LIVE setting] Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5a8388177959..438d6948895f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -83,6 +83,7 @@ enum nvme_rdma_queue_flags { NVME_RDMA_Q_CONNECTED = (1 << 0), NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1), NVME_RDMA_Q_DELETING = (1 << 2), + NVME_RDMA_Q_LIVE = (1 << 3), }; struct nvme_rdma_queue { @@ -626,6 +627,7 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) break; + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); } return ret; @@ -712,6 +714,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ret) goto stop_admin_q; + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); if (ret) goto stop_admin_q; @@ -761,8 +765,10 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_stop_keep_alive(&ctrl->ctrl); - for (i = 0; i < ctrl->queue_count; i++) + for (i = 0; i < ctrl->queue_count; i++) { clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags); + clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); + } if (ctrl->queue_count > 1) nvme_stop_queues(&ctrl->ctrl); @@ -1378,6 +1384,24 @@ nvme_rdma_timeout(struct request *rq, bool reserved) return BLK_EH_HANDLED; } +/* + * We cannot accept any other command until the Connect command has completed. + */ +static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, + struct request *rq) +{ + if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { + struct nvme_command *cmd = (struct nvme_command *)rq->cmd; + + if (rq->cmd_type != REQ_TYPE_DRV_PRIV || + cmd->common.opcode != nvme_fabrics_command || + cmd->fabrics.fctype != nvme_fabrics_type_connect) + return false; + } + + return true; +} + static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1394,6 +1418,9 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, WARN_ON_ONCE(rq->tag < 0); + if (!nvme_rdma_queue_is_ready(queue, rq)) + return BLK_MQ_RQ_QUEUE_BUSY; + dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); @@ -1544,6 +1571,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) if (error) goto out_cleanup_queue; + set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); if (error) { dev_err(ctrl->ctrl.device, From 8242ddac1bfcf6eb8873b4d0a4e7a172c2b5b625 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 6 Nov 2016 11:03:30 +0200 Subject: [PATCH 3/6] nvmet: Don't queue fatal error work if csts.cfs is set In the transport, in case of an interal queue error like error completion in rdma we trigger a fatal error. However, multiple queues in the same controller can serr error completions and we don't want to trigger fatal error work more than once. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/target/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b4cacb6f0258..a21437a33adb 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -838,9 +838,13 @@ static void nvmet_fatal_error_handler(struct work_struct *work) void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) { - ctrl->csts |= NVME_CSTS_CFS; - INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); - schedule_work(&ctrl->fatal_err_work); + mutex_lock(&ctrl->lock); + if (!(ctrl->csts & NVME_CSTS_CFS)) { + ctrl->csts |= NVME_CSTS_CFS; + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); + schedule_work(&ctrl->fatal_err_work); + } + mutex_unlock(&ctrl->lock); } EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); From 766dbb179d41d6337fed2b3ca00caa5845d298ce Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 6 Nov 2016 11:09:49 +0200 Subject: [PATCH 4/6] nvmet-rdma: don't forget to delete a queue from the list of connection failed In case we accepted a queue connection and it failed, we might not remove the queue from the list until we unload and clean it up. We should delete it from the queue list on the relevant handler. Signed-off-by: Sagi Grimberg --- drivers/nvme/target/rdma.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index cf60759cc169..8c06675c2305 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1066,6 +1066,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, spin_lock_init(&queue->rsp_wr_wait_lock); INIT_LIST_HEAD(&queue->free_rsps); spin_lock_init(&queue->rsps_lock); + INIT_LIST_HEAD(&queue->queue_list); queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); if (queue->idx < 0) { @@ -1269,7 +1270,12 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, { WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); - pr_err("failed to connect queue\n"); + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + pr_err("failed to connect queue %d\n", queue->idx); schedule_work(&queue->release_work); } From c8dbc37cd81d4705fce51123f5d81ea3267a5b88 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 8 Nov 2016 09:16:02 -0800 Subject: [PATCH 5/6] nvme-rdma: stop and free io queues on connect failure While testing nvme-rdma with the spdk nvmf target over iw_cxgb4, I configured the target (mistakenly) to generate an error creating the NVMF IO queues. This resulted a "Invalid SQE Parameter" error sent back to the host on the first IO queue connect: [ 9610.928182] nvme nvme1: queue_size 128 > ctrl maxcmd 120, clamping down [ 9610.938745] nvme nvme1: creating 32 I/O queues. So nvmf_connect_io_queue() returns an error to nvmf_connect_io_queue() / nvmf_connect_io_queues(), and that is returned to nvme_rdma_create_io_queues(). In the error path, nvmf_rdma_create_io_queues() frees the queue tagset memory _before_ stopping and freeing the IB queues, which causes yet another touch-after-free crash due to SQ CQEs being flushed after the ib_cqe structs pointed-to by the flushed WRs have been freed (since they are part of the nvme_rdma_request struct). The fix is to stop and free the queues in nvmf_connect_io_queues() if there is an error connecting any of the queues. Signed-off-by: Steve Wise Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 438d6948895f..3d25add36d91 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -625,11 +625,18 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) for (i = 1; i < ctrl->queue_count; i++) { ret = nvmf_connect_io_queue(&ctrl->ctrl, i); - if (ret) - break; + if (ret) { + dev_info(ctrl->ctrl.device, + "failed to connect i/o queue: %d\n", ret); + goto out_free_queues; + } set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); } + return 0; + +out_free_queues: + nvme_rdma_free_io_queues(ctrl); return ret; } From 14c862dbb0a0e0a9baec20480d441e32cb54b2b9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 6 Nov 2016 11:03:59 +0200 Subject: [PATCH 6/6] nvmet-rdma: drain the queue-pair just before freeing it draining the qp right after disconnect might not suffice because the nvmet sq is not fully drained (in nvmet_sq_destroy) and we might see completions after the drain. Instead, drain right before the qp destroy which comes after the sq destruction and we can be sure that no posts come after the drain. Tested-by: Steve Wise Signed-off-by: Sagi Grimberg --- drivers/nvme/target/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 8c06675c2305..005ef5d17a19 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -951,6 +951,7 @@ err_destroy_cq: static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) { + ib_drain_qp(queue->cm_id->qp); rdma_destroy_qp(queue->cm_id); ib_free_cq(queue->cq); } @@ -1245,7 +1246,6 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) if (disconnect) { rdma_disconnect(queue->cm_id); - ib_drain_qp(queue->cm_id->qp); schedule_work(&queue->release_work); } }