From d6a2b9535d1e52bea269c138614c4801469d10e1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 26 Nov 2018 16:39:47 -0700 Subject: [PATCH 1/8] nvme: Free ctrl device name on init failure Free the kobject name that was allocated for the controller device on failure rather than its parent. Fixes: d22524a4782a9 ("nvme: switch controller refcounting to use struct device") Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 559d567693b8..5afda6fe5ae9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3585,7 +3585,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_free_name: - kfree_const(dev->kobj.name); + kfree_const(ctrl->device->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: From dfa74422d604abc2e16763db12646583219806e4 Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Mon, 26 Nov 2018 12:01:30 -0500 Subject: [PATCH 2/8] nvme-fc: initialize nvme_req(rq)->ctrl after calling __nvme_fc_init_request() __nvme_fc_init_request() invokes memset() on the nvme_fcp_op_w_sgl structure, which NULLed-out the nvme_req(req)->ctrl field previously set by nvme_fc_init_request(). This apparently was not referenced until commit faf4a44fff ("nvme: support traffic based keep-alive") which now results in a crash in nvme_complete_rq(): [ 8386.897130] RIP: 0010:panic+0x220/0x26c [ 8386.901406] Code: 83 3d 6f ee 72 01 00 74 05 e8 e8 54 02 00 48 c7 c6 40 fd 5b b4 48 c7 c7 d8 8d c6 b3 31e [ 8386.922359] RSP: 0018:ffff99650019fc40 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 [ 8386.930804] RAX: 0000000000000046 RBX: 0000000000000000 RCX: 0000000000000006 [ 8386.938764] RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff8e325f8168b0 [ 8386.946725] RBP: ffff99650019fcb0 R08: 0000000000000000 R09: 00000000000004f8 [ 8386.954687] R10: 0000000000000000 R11: ffff99650019f9b8 R12: ffffffffb3c55f3c [ 8386.962648] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 [ 8386.970613] oops_end+0xd1/0xe0 [ 8386.974116] no_context+0x1b2/0x3c0 [ 8386.978006] do_page_fault+0x32/0x140 [ 8386.982090] page_fault+0x1e/0x30 [ 8386.985786] RIP: 0010:nvme_complete_rq+0x65/0x1d0 [nvme_core] [ 8386.992195] Code: 41 bc 03 00 00 00 74 16 0f 86 c3 00 00 00 66 3d 83 00 41 bc 06 00 00 00 0f 85 e7 00 000 [ 8387.013147] RSP: 0018:ffff99650019fe18 EFLAGS: 00010246 [ 8387.018973] RAX: 0000000000000000 RBX: ffff8e322ae51280 RCX: 0000000000000001 [ 8387.026935] RDX: 0000000000000400 RSI: 0000000000000001 RDI: ffff8e322ae51280 [ 8387.034897] RBP: ffff8e322ae51280 R08: 0000000000000000 R09: ffffffffb2f0b890 [ 8387.042859] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 [ 8387.050821] R13: 0000000000000100 R14: 0000000000000004 R15: ffff8e2b0446d990 [ 8387.058782] ? swiotlb_unmap_page+0x40/0x40 [ 8387.063448] nvme_fc_complete_rq+0x2d/0x70 [nvme_fc] [ 8387.068986] blk_done_softirq+0xa1/0xd0 [ 8387.073264] __do_softirq+0xd6/0x2a9 [ 8387.077251] run_ksoftirqd+0x26/0x40 [ 8387.081238] smpboot_thread_fn+0x10e/0x160 [ 8387.085807] kthread+0xf8/0x130 [ 8387.089309] ? sort_range+0x20/0x20 [ 8387.093198] ? kthread_stop+0x110/0x110 [ 8387.097475] ret_from_fork+0x35/0x40 [ 8387.101462] ---[ end trace 7106b0adf5e422f8 ]--- Fixes: faf4a44fff ("nvme: support traffic based keep-alive") Signed-off-by: Ewan D. Milne Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 54032c466636..feb86b59170e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1752,12 +1752,12 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; int res; - nvme_req(rq)->ctrl = &ctrl->ctrl; res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++); if (res) return res; op->op.fcp_req.first_sgl = &op->sgl[0]; op->op.fcp_req.private = &op->priv[0]; + nvme_req(rq)->ctrl = &ctrl->ctrl; return res; } From 751a0cc0cd3a0d51e6aaf6fd3b8bd31f4ecfaf3e Mon Sep 17 00:00:00 2001 From: Igor Konopko Date: Fri, 23 Nov 2018 16:58:10 +0100 Subject: [PATCH 3/8] nvme-pci: fix surprise removal When a PCIe NVMe device is not present, nvme_dev_remove_admin() calls blk_cleanup_queue() on the admin queue, which frees the hctx for that queue. Moments later, on the same path nvme_kill_queues() calls blk_mq_unquiesce_queue() on admin queue and tries to access hctx of it, which leads to following OOPS: Oops: 0000 [#1] SMP PTI RIP: 0010:sbitmap_any_bit_set+0xb/0x40 Call Trace: blk_mq_run_hw_queue+0xd5/0x150 blk_mq_run_hw_queues+0x3a/0x50 nvme_kill_queues+0x26/0x50 nvme_remove_namespaces+0xb2/0xc0 nvme_remove+0x60/0x140 pci_device_remove+0x3b/0xb0 Fixes: cb4bfda62afa2 ("nvme-pci: fix hot removal during error handling") Signed-off-by: Igor Konopko Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5afda6fe5ae9..bb39b91253c2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3607,7 +3607,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) down_read(&ctrl->namespaces_rwsem); /* Forcibly unquiesce queues to avoid blocking dispatch */ - if (ctrl->admin_q) + if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) blk_mq_unquiesce_queue(ctrl->admin_q); list_for_each_entry(ns, &ctrl->namespaces, list) From 41e817bca3acd3980efe5dd7d28af0e6f4ab9247 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Fri, 30 Nov 2018 08:35:14 -0700 Subject: [PATCH 4/8] fs: fix lost error code in dio_complete commit e259221763a40403d5bb232209998e8c45804ab8 ("fs: simplify the generic_write_sync prototype") reworked callers of generic_write_sync(), and ended up dropping the error return for the directio path. Prior to that commit, in dio_complete(), an error would be bubbled up the stack, but after that commit, errors passed on to dio_complete were eaten up. This was reported on the list earlier, and a fix was proposed in https://lore.kernel.org/lkml/20160921141539.GA17898@infradead.org/, but never followed up with. We recently hit this bug in our testing where fencing io errors, which were previously erroring out with EIO, were being returned as success operations after this commit. The fix proposed on the list earlier was a little short -- it would have still called generic_write_sync() in case `ret` already contained an error. This fix ensures generic_write_sync() is only called when there's no pending error in the write. Additionally, transferred is replaced with ret to bring this code in line with other callers. Fixes: e259221763a4 ("fs: simplify the generic_write_sync prototype") Reported-by: Ravi Nankani Signed-off-by: Maximilian Heyne Reviewed-by: Christoph Hellwig CC: Torsten Mehlan CC: Uwe Dannowski CC: Amit Shah CC: David Woodhouse CC: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 722d17c88edb..41a0e97252ae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -325,8 +325,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) */ dio->iocb->ki_pos += transferred; - if (dio->op == REQ_OP_WRITE) - ret = generic_write_sync(dio->iocb, transferred); + if (ret > 0 && dio->op == REQ_OP_WRITE) + ret = generic_write_sync(dio->iocb, ret); dio->iocb->ki_complete(dio->iocb, ret, 0); } From 14a1336e6fff47dd1028b484d6c802105c58e2ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Nov 2018 16:57:54 +0100 Subject: [PATCH 5/8] nvme: warn when finding multi-port subsystems without multipathing enabled Without CONFIG_NVME_MULTIPATH enabled a multi-port subsystem might show up as invididual devices and cause problems, warn about it. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/nvme.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cee79cb388af..081cbdcce880 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -531,6 +531,9 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { + if (ctrl->subsys->cmic & (1 << 3)) + dev_warn(ctrl->device, +"Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); return 0; } static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) From f6c8e432cb0479255322c5d0335b9f1699a0270c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 21 Nov 2018 15:17:37 -0800 Subject: [PATCH 6/8] nvme: flush namespace scanning work just before removing namespaces nvme_stop_ctrl can be called also for reset flow and there is no need to flush the scan_work as namespaces are not being removed. This can cause deadlock in rdma, fc and loop drivers since nvme_stop_ctrl barriers before controller teardown (and specifically I/O cancellation of the scan_work itself) takes place, but the scan_work will be blocked anyways so there is no need to flush it. Instead, move scan_work flush to nvme_remove_namespaces() where it really needs to flush. Reported-by: Ming Lei Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed by: James Smart Tested-by: Ewan D. Milne Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb39b91253c2..3cf1b773158e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3314,6 +3314,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) struct nvme_ns *ns, *next; LIST_HEAD(ns_list); + /* prevent racing with ns scanning */ + flush_work(&ctrl->scan_work); + /* * The dead states indicates the controller was not gracefully * disconnected. In that case, we won't be able to flush any data while @@ -3476,7 +3479,6 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); - flush_work(&ctrl->scan_work); cancel_work_sync(&ctrl->fw_act_work); if (ctrl->ops->stop_ctrl) ctrl->ops->stop_ctrl(ctrl); From 6344d02dc8f886b6bbcd922ae1a17e4a41500f2d Mon Sep 17 00:00:00 2001 From: Prabhath Sajeepa Date: Wed, 28 Nov 2018 11:11:29 -0700 Subject: [PATCH 7/8] nvme-rdma: fix double freeing of async event data Some error paths in configuration of admin queue free data buffer associated with async request SQE without resetting the data buffer pointer to NULL, This buffer is also freed up again if the controller is shutdown or reset. Signed-off-by: Prabhath Sajeepa Reviewed-by: Roland Dreier Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index d181cafedc58..ab6ec7295bf9 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -184,6 +184,7 @@ static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe, qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir); if (ib_dma_mapping_error(ibdev, qe->dma)) { kfree(qe->data); + qe->data = NULL; return -ENOMEM; } @@ -823,6 +824,7 @@ out_free_tagset: out_free_async_qe: nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); + ctrl->async_event_sqe.data = NULL; out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; From 2a5cf35cd6c56b2924bce103413ad3381bdc31fa Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Dec 2018 00:38:18 +0800 Subject: [PATCH 8/8] block: fix single range discard merge There are actually two kinds of discard merge: - one is the normal discard merge, just like normal read/write request, and call it single-range discard - another is the multi-range discard, queue_max_discard_segments(rq->q) > 1 For the former case, queue_max_discard_segments(rq->q) is 1, and we should handle this kind of discard merge like the normal read/write request. This patch fixes the following kernel panic issue[1], which is caused by not removing the single-range discard request from elevator queue. Guangwu has one raid discard test case, in which this issue is a bit easier to trigger, and I verified that this patch can fix the kernel panic issue in Guangwu's test case. [1] kernel panic log from Jens's report BUG: unable to handle kernel NULL pointer dereference at 0000000000000148 PGD 0 P4D 0. Oops: 0000 [#1] SMP PTI CPU: 37 PID: 763 Comm: kworker/37:1H Not tainted \ 4.20.0-rc3-00649-ge64d9a554a91-dirty #14 Hardware name: Wiwynn \ Leopard-Orv2/Leopard-DDR BW, BIOS LBM08 03/03/2017 Workqueue: kblockd \ blk_mq_run_work_fn RIP: \ 0010:blk_mq_get_driver_tag+0x81/0x120 Code: 24 \ 10 48 89 7c 24 20 74 21 83 fa ff 0f 95 c0 48 8b 4c 24 28 65 48 33 0c 25 28 00 00 00 \ 0f 85 96 00 00 00 48 83 c4 30 5b 5d c3 <48> 8b 87 48 01 00 00 8b 40 04 39 43 20 72 37 \ f6 87 b0 00 00 00 02 RSP: 0018:ffffc90004aabd30 EFLAGS: 00010246 \ RAX: 0000000000000003 RBX: ffff888465ea1300 RCX: ffffc90004aabde8 RDX: 00000000ffffffff RSI: ffffc90004aabde8 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff888465ea1348 R09: 0000000000000000 R10: 0000000000001000 R11: 00000000ffffffff R12: ffff888465ea1300 R13: 0000000000000000 R14: ffff888465ea1348 R15: ffff888465d10000 FS: 0000000000000000(0000) GS:ffff88846f9c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000148 CR3: 000000000220a003 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: blk_mq_dispatch_rq_list+0xec/0x480 ? elv_rb_del+0x11/0x30 blk_mq_do_dispatch_sched+0x6e/0xf0 blk_mq_sched_dispatch_requests+0xfa/0x170 __blk_mq_run_hw_queue+0x5f/0xe0 process_one_work+0x154/0x350 worker_thread+0x46/0x3c0 kthread+0xf5/0x130 ? process_one_work+0x350/0x350 ? kthread_destroy_worker+0x50/0x50 ret_from_fork+0x1f/0x30 Modules linked in: sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel \ kvm switchtec irqbypass iTCO_wdt iTCO_vendor_support efivars cdc_ether usbnet mii \ cdc_acm i2c_i801 lpc_ich mfd_core ipmi_si ipmi_devintf ipmi_msghandler acpi_cpufreq \ button sch_fq_codel nfsd nfs_acl lockd grace auth_rpcgss oid_registry sunrpc nvme \ nvme_core fuse sg loop efivarfs autofs4 CR2: 0000000000000148 \ ---[ end trace 340a1fb996df1b9b ]--- RIP: 0010:blk_mq_get_driver_tag+0x81/0x120 Code: 24 10 48 89 7c 24 20 74 21 83 fa ff 0f 95 c0 48 8b 4c 24 28 65 48 33 0c 25 28 \ 00 00 00 0f 85 96 00 00 00 48 83 c4 30 5b 5d c3 <48> 8b 87 48 01 00 00 8b 40 04 39 43 \ 20 72 37 f6 87 b0 00 00 00 02 Fixes: 445251d0f4d329a ("blk-mq: fix discard merge with scheduler attached") Reported-by: Jens Axboe Cc: Guangwu Zhang Cc: Christoph Hellwig Cc: Jianchao Wang Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index e7696c47489a..7695034f4b87 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -820,7 +820,7 @@ static struct request *attempt_merge(struct request_queue *q, req->__data_len += blk_rq_bytes(next); - if (req_op(req) != REQ_OP_DISCARD) + if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); /*