From 0b8eb629a700c0ef15a437758db8255f8444e76c Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 24 Jun 2020 18:21:39 +0800 Subject: [PATCH 01/12] block: release bip in a right way in error path Release bip using kfree() in error path when that was allocated by kmalloc(). Signed-off-by: Chengguang Xu Reviewed-by: Christoph Hellwig Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio-integrity.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 23632a33ed39..4707e90b8ee5 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -24,6 +24,18 @@ void blk_flush_integrity(void) flush_workqueue(kintegrityd_wq); } +void __bio_integrity_free(struct bio_set *bs, struct bio_integrity_payload *bip) +{ + if (bs && mempool_initialized(&bs->bio_integrity_pool)) { + if (bip->bip_vec) + bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + mempool_free(bip, &bs->bio_integrity_pool); + } else { + kfree(bip); + } +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -78,7 +90,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: - mempool_free(bip, &bs->bio_integrity_pool); + __bio_integrity_free(bs, bip); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -99,14 +111,7 @@ void bio_integrity_free(struct bio *bio) kfree(page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset); - if (bs && mempool_initialized(&bs->bio_integrity_pool)) { - bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab); - - mempool_free(bip, &bs->bio_integrity_pool); - } else { - kfree(bip); - } - + __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } From 4fea243ebce40e2e4193d7d25eabfd963c46ef8c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 16 Jun 2020 12:34:21 +0300 Subject: [PATCH 02/12] nvme: set initial value for controller's numa node Initialize the node to NUMA_NO_NODE value. Transports that are aware of numa node affinity can override it (e.g. RDMA transport set the affinity according to the RDMA HCA). Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c2c5bc4fb702..915fa2e609eb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4174,6 +4174,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; + ctrl->numa_node = NUMA_NO_NODE; INIT_WORK(&ctrl->scan_work, nvme_scan_work); INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); From 635333e400e2e678258ea45232415cdadadd7818 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 16 Jun 2020 12:34:22 +0300 Subject: [PATCH 03/12] nvme-pci: override the value of the controller's numa node Set the node value according to the PCI device numa node. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e2bacd369a88..46dc530d461c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1669,6 +1669,8 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) if (result) return result; + dev->ctrl.numa_node = dev_to_node(dev->dev); + nvmeq = &dev->queues[0]; aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; From d4ec47f120537c75184c3dc939d3b2e1bcc8b260 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 16 Jun 2020 12:34:23 +0300 Subject: [PATCH 04/12] nvme-pci: initialize tagset numa value to the value of the ctrl Both admin's and drive's tagsets should be set according the numa node of the controller. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 46dc530d461c..b1d18f0633c7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1593,7 +1593,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; dev->admin_tagset.timeout = ADMIN_TIMEOUT; - dev->admin_tagset.numa_node = dev_to_node(dev->dev); + dev->admin_tagset.numa_node = dev->ctrl.numa_node; dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; @@ -2259,7 +2259,7 @@ static void nvme_dev_add(struct nvme_dev *dev) if (dev->io_queues[HCTX_TYPE_POLL]) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; - dev->tagset.numa_node = dev_to_node(dev->dev); + dev->tagset.numa_node = dev->ctrl.numa_node; dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; dev->tagset.cmd_size = sizeof(struct nvme_iod); From 610c823510f7d05a4ddb7bba00893e455bbef9dc Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 16 Jun 2020 12:34:24 +0300 Subject: [PATCH 05/12] nvme-tcp: initialize tagset numa value to the value of the ctrl Both admin's and drive's tagsets should be set according the numa node of the controller. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3345ec7efaff..79ef2b8e2b3c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1532,7 +1532,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_tcp_admin_mq_ops; set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->flags = BLK_MQ_F_BLOCKING; set->cmd_size = sizeof(struct nvme_tcp_request); set->driver_data = ctrl; @@ -1544,7 +1544,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->ops = &nvme_tcp_mq_ops; set->queue_depth = nctrl->sqsize + 1; set->reserved_tags = 1; /* fabric connect */ - set->numa_node = NUMA_NO_NODE; + set->numa_node = nctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; set->cmd_size = sizeof(struct nvme_tcp_request); set->driver_data = ctrl; From 1b4ad7a50ab06573aa8841217d6a472dc1db2d85 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 16 Jun 2020 12:34:25 +0300 Subject: [PATCH 06/12] nvme-loop: initialize tagset numa value to the value of the ctrl Both admin's and drive's tagsets should be set according the numa node of the controller. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 0d54e730cbf2..6344e73c9354 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -340,7 +340,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ - ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + NVME_INLINE_SG_CNT * sizeof(struct scatterlist); ctrl->admin_tag_set.driver_data = ctrl; @@ -512,7 +512,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) ctrl->tag_set.ops = &nvme_loop_mq_ops; ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; ctrl->tag_set.reserved_tags = 1; /* fabric connect */ - ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.numa_node = ctrl->ctrl.numa_node; ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + NVME_INLINE_SG_CNT * sizeof(struct scatterlist); From 032a9966a22a3596addf81dacf0c1736dfedc32a Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jun 2020 17:55:25 +0300 Subject: [PATCH 07/12] nvme-rdma: assign completion vector correctly The completion vector index that is given during CQ creation can't exceed the number of support vectors by the underlying RDMA device. This violation currently can accure, for example, in case one will try to connect with N regular read/write queues and M poll queues and the sum of N + M > num_supported_vectors. This will lead to failure in establish a connection to remote target. Instead, in that case, share a completion vector between queues. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f8f856dc0c67..13506a87a444 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -470,7 +470,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) * Spread I/O queues completion vectors according their queue index. * Admin queues can always go on completion vector 0. */ - comp_vector = idx == 0 ? idx : idx - 1; + comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors; /* Polling queues need direct cq polling context */ if (nvme_rdma_poll_queue(queue)) From 3b4b19721ec652ad2c4fe51dfbe5124212b5f581 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 24 Jun 2020 01:53:08 -0700 Subject: [PATCH 08/12] nvme: fix possible deadlock when I/O is blocked Revert fab7772bfbcf ("nvme-multipath: revalidate nvme_ns_head gendisk in nvme_validate_ns") When adding a new namespace to the head disk (via nvme_mpath_set_live) we will see partition scan which triggers I/O on the mpath device node. This process will usually be triggered from the scan_work which holds the scan_lock. If I/O blocks (if we got ana change currently have only available paths but none are accessible) this can deadlock on the head disk bd_mutex as both partition scan I/O takes it, and head disk revalidation takes it to check for resize (also triggered from scan_work on a different path). See trace [1]. The mpath disk revalidation was originally added to detect online disk size change, but this is no longer needed since commit cb224c3af4df ("nvme: Convert to use set_capacity_revalidate_and_notify") which already updates resize info without unnecessarily revalidating the disk (the mpath disk doesn't even implement .revalidate_disk fop). [1]: -- kernel: INFO: task kworker/u65:9:494 blocked for more than 241 seconds. kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kernel: kworker/u65:9 D 0 494 2 0x80004000 kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: schedule_preempt_disabled+0xe/0x10 kernel: __mutex_lock.isra.0+0x182/0x4f0 kernel: __mutex_lock_slowpath+0x13/0x20 kernel: mutex_lock+0x2e/0x40 kernel: revalidate_disk+0x63/0xa0 kernel: __nvme_revalidate_disk+0xfe/0x110 [nvme_core] kernel: nvme_revalidate_disk+0xa4/0x160 [nvme_core] kernel: ? evict+0x14c/0x1b0 kernel: revalidate_disk+0x2b/0xa0 kernel: nvme_validate_ns+0x49/0x940 [nvme_core] kernel: ? blk_mq_free_request+0xd2/0x100 kernel: ? __nvme_submit_sync_cmd+0xbe/0x1e0 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 kernel: ? process_one_work+0x380/0x380 kernel: ? kthread_park+0x80/0x80 kernel: ret_from_fork+0x1f/0x40 ... kernel: INFO: task kworker/u65:1:2630 blocked for more than 241 seconds. kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kernel: kworker/u65:1 D 0 2630 2 0x80004000 kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: ? __switch_to_asm+0x34/0x70 kernel: ? file_fdatawait_range+0x30/0x30 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: ? kmem_cache_alloc_trace+0x19c/0x230 kernel: efi_partition+0x1e6/0x708 kernel: ? vsnprintf+0x39e/0x4e0 kernel: ? snprintf+0x49/0x60 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: ? nvme_update_ns_ana_state+0x60/0x60 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: ? blk_mq_free_request+0xd2/0x100 kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 kernel: ? process_one_work+0x380/0x380 kernel: ? kthread_park+0x80/0x80 kernel: ret_from_fork+0x1f/0x40 -- Fixes: fab7772bfbcf ("nvme-multipath: revalidate nvme_ns_head gendisk in nvme_validate_ns") Signed-off-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 915fa2e609eb..28f4388c1337 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1974,7 +1974,6 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); - revalidate_disk(ns->head->disk); } #endif return 0; From 489dd102a2c7c94d783a35f9412eb085b8da1aa4 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Wed, 24 Jun 2020 01:53:09 -0700 Subject: [PATCH 09/12] nvme-multipath: fix deadlock between ana_work and scan_work When scan_work calls nvme_mpath_add_disk() this holds ana_lock and invokes nvme_parse_ana_log(), which may issue IO in device_add_disk() and hang waiting for an accessible path. While nvme_mpath_set_live() only called when nvme_state_is_live(), a transition may cause NVME_SC_ANA_TRANSITION and requeue the IO. In order to recover and complete the IO ana_work on the same ctrl should be able to update the path state and remove NVME_NS_ANA_PENDING. The deadlock occurs because scan_work keeps holding ana_lock, so ana_work hangs [1]. Fix: Now nvme_mpath_add_disk() uses nvme_parse_ana_log() to obtain a copy of the ANA group desc, and then calls nvme_update_ns_ana_state() without holding ana_lock. [1]: kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_set_ns_ana_state+0x1e/0x30 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_mpath_add_disk+0x47/0x90 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x24f/0x380 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x249/0x400 kernel: kthread+0x104/0x140 kernel: Workqueue: nvme-wq nvme_ana_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: schedule_preempt_disabled+0xe/0x10 kernel: __mutex_lock.isra.0+0x182/0x4f0 kernel: ? __switch_to_asm+0x34/0x70 kernel: ? select_task_rq_fair+0x1aa/0x5c0 kernel: ? kvm_sched_clock_read+0x11/0x20 kernel: ? sched_clock+0x9/0x10 kernel: __mutex_lock_slowpath+0x13/0x20 kernel: mutex_lock+0x2e/0x40 kernel: nvme_read_ana_log+0x3a/0x100 [nvme_core] kernel: nvme_ana_work+0x15/0x20 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x4d/0x400 kernel: kthread+0x104/0x140 kernel: ? process_one_work+0x380/0x380 kernel: ? kthread_park+0x80/0x80 kernel: ret_from_fork+0x35/0x40 Fixes: 0d0b660f214d ("nvme: add ANA support") Signed-off-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index da78e499947a..0ef183c1153e 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -640,26 +640,34 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, } DEVICE_ATTR_RO(ana_state); -static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl, +static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { - struct nvme_ns *ns = data; + struct nvme_ana_group_desc *dst = data; - if (ns->ana_grpid == le32_to_cpu(desc->grpid)) { - nvme_update_ns_ana_state(desc, ns); - return -ENXIO; /* just break out of the loop */ - } + if (desc->grpid != dst->grpid) + return 0; - return 0; + *dst = *desc; + return -ENXIO; /* just break out of the loop */ } void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) { if (nvme_ctrl_use_ana(ns->ctrl)) { + struct nvme_ana_group_desc desc = { + .grpid = id->anagrpid, + .state = 0, + }; + mutex_lock(&ns->ctrl->ana_lock); ns->ana_grpid = le32_to_cpu(id->anagrpid); - nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state); + nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); mutex_unlock(&ns->ctrl->ana_lock); + if (desc.state) { + /* found the group desc: update */ + nvme_update_ns_ana_state(&desc, ns); + } } else { mutex_lock(&ns->head->lock); ns->ana_state = NVME_ANA_OPTIMIZED; From e164471dcf19308d154adb69e7760d8ba426a77f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 24 Jun 2020 01:53:10 -0700 Subject: [PATCH 10/12] nvme: don't protect ns mutation with ns->head->lock Right now ns->head->lock is protecting namespace mutation which is wrong and unneeded. Move it to only protect against head mutations. While we're at it, remove unnecessary ns->head reference as we already have head pointer. The problem with this is that the head->lock spans mpath disk node I/O that may block under some conditions (if for example the controller is disconnecting or the path became inaccessible), The locking scheme does not allow any other path to enable itself, preventing blocked I/O to complete and forward-progress from there. This is a preparation patch for the fix in a subsequent patch where the disk I/O will also be done outside the head->lock. Fixes: 0d0b660f214d ("nvme: add ANA support") Signed-off-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0ef183c1153e..5c48a38aebf8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -409,11 +409,10 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) { struct nvme_ns_head *head = ns->head; - lockdep_assert_held(&ns->head->lock); - if (!head->disk) return; + mutex_lock(&head->lock); if (!(head->disk->flags & GENHD_FL_UP)) device_add_disk(&head->subsys->dev, head->disk, nvme_ns_id_attr_groups); @@ -426,9 +425,10 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) __nvme_find_path(head, node); srcu_read_unlock(&head->srcu, srcu_idx); } + mutex_unlock(&head->lock); - synchronize_srcu(&ns->head->srcu); - kblockd_schedule_work(&ns->head->requeue_work); + synchronize_srcu(&head->srcu); + kblockd_schedule_work(&head->requeue_work); } static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, @@ -483,14 +483,12 @@ static inline bool nvme_state_is_live(enum nvme_ana_state state) static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, struct nvme_ns *ns) { - mutex_lock(&ns->head->lock); ns->ana_grpid = le32_to_cpu(desc->grpid); ns->ana_state = desc->state; clear_bit(NVME_NS_ANA_PENDING, &ns->flags); if (nvme_state_is_live(ns->ana_state)) nvme_mpath_set_live(ns); - mutex_unlock(&ns->head->lock); } static int nvme_update_ana_state(struct nvme_ctrl *ctrl, @@ -669,10 +667,8 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) nvme_update_ns_ana_state(&desc, ns); } } else { - mutex_lock(&ns->head->lock); ns->ana_state = NVME_ANA_OPTIMIZED; nvme_mpath_set_live(ns); - mutex_unlock(&ns->head->lock); } if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { From d8a22f85609fadb46ba699e0136cc3ebdeebff79 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Wed, 24 Jun 2020 01:53:11 -0700 Subject: [PATCH 11/12] nvme-multipath: fix deadlock due to head->lock In the following scenario scan_work and ana_work will deadlock: When scan_work calls nvme_mpath_add_disk() this holds ana_lock and invokes nvme_parse_ana_log(), which may issue IO in device_add_disk() and hang waiting for an accessible path. While nvme_mpath_set_live() only called when nvme_state_is_live(), a transition may cause NVME_SC_ANA_TRANSITION and requeue the IO. Since nvme_mpath_set_live() holds ns->head->lock, an ana_work on ANY ctrl will not be able to complete nvme_mpath_set_live() on the same ns->head, which is required in order to update the new accessible path and remove NVME_NS_ANA_PENDING.. Therefore IO never completes: deadlock [1]. Fix: Move device_add_disk out of the head->lock and protect it with an atomic test_and_set for a new NVME_NS_HEAD_HAS_DISK bit. [1]: kernel: INFO: task kworker/u8:2:160 blocked for more than 120 seconds. kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kernel: kworker/u8:2 D 0 160 2 0x80004000 kernel: Workqueue: nvme-wq nvme_ana_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: schedule_preempt_disabled+0xe/0x10 kernel: __mutex_lock.isra.0+0x182/0x4f0 kernel: __mutex_lock_slowpath+0x13/0x20 kernel: mutex_lock+0x2e/0x40 kernel: nvme_update_ns_ana_state+0x22/0x60 [nvme_core] kernel: nvme_update_ana_state+0xca/0xe0 [nvme_core] kernel: nvme_parse_ana_log+0xa1/0x180 [nvme_core] kernel: nvme_read_ana_log+0x76/0x100 [nvme_core] kernel: nvme_ana_work+0x15/0x20 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x4d/0x400 kernel: kthread+0x104/0x140 kernel: ret_from_fork+0x35/0x40 kernel: INFO: task kworker/u8:4:439 blocked for more than 120 seconds. kernel: Tainted: G OE 5.3.5-050305-generic #201910071830 kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kernel: kworker/u8:4 D 0 439 2 0x80004000 kernel: Workqueue: nvme-wq nvme_scan_work [nvme_core] kernel: Call Trace: kernel: __schedule+0x2b9/0x6c0 kernel: schedule+0x42/0xb0 kernel: io_schedule+0x16/0x40 kernel: do_read_cache_page+0x438/0x830 kernel: read_cache_page+0x12/0x20 kernel: read_dev_sector+0x27/0xc0 kernel: read_lba+0xc1/0x220 kernel: efi_partition+0x1e6/0x708 kernel: check_partition+0x154/0x244 kernel: rescan_partitions+0xae/0x280 kernel: __blkdev_get+0x40f/0x560 kernel: blkdev_get+0x3d/0x140 kernel: __device_add_disk+0x388/0x480 kernel: device_add_disk+0x13/0x20 kernel: nvme_mpath_set_live+0x119/0x140 [nvme_core] kernel: nvme_update_ns_ana_state+0x5c/0x60 [nvme_core] kernel: nvme_mpath_add_disk+0xbe/0x100 [nvme_core] kernel: nvme_validate_ns+0x396/0x940 [nvme_core] kernel: nvme_scan_work+0x256/0x390 [nvme_core] kernel: process_one_work+0x1db/0x380 kernel: worker_thread+0x4d/0x400 kernel: kthread+0x104/0x140 kernel: ret_from_fork+0x35/0x40 Fixes: 0d0b660f214d ("nvme: add ANA support") Signed-off-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 4 ++-- drivers/nvme/host/nvme.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5c48a38aebf8..79c8caced81f 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -412,11 +412,11 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) if (!head->disk) return; - mutex_lock(&head->lock); - if (!(head->disk->flags & GENHD_FL_UP)) + if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) device_add_disk(&head->subsys->dev, head->disk, nvme_ns_id_attr_groups); + mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { int node, srcu_idx; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c0f4226d3299..2ef8d501e2a8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -364,6 +364,8 @@ struct nvme_ns_head { spinlock_t requeue_lock; struct work_struct requeue_work; struct mutex lock; + unsigned long flags; +#define NVME_NSHEAD_DISK_LIVE 0 struct nvme_ns __rcu *current_path[]; #endif }; From c31244669f57963b6ce133a5555b118fc50aec95 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 24 Jun 2020 01:53:12 -0700 Subject: [PATCH 12/12] nvme-multipath: fix bogus request queue reference put The mpath disk node takes a reference on the request mpath request queue when adding live path to the mpath gendisk. However if we connected to an inaccessible path device_add_disk is not called, so if we disconnect and remove the mpath gendisk we endup putting an reference on the request queue that was never taken [1]. Fix that to check if we ever added a live path (using NVME_NS_HEAD_HAS_DISK flag) and if not, clear the disk->queue reference. [1]: ------------[ cut here ]------------ refcount_t: underflow; use-after-free. WARNING: CPU: 1 PID: 1372 at lib/refcount.c:28 refcount_warn_saturate+0xa6/0xf0 CPU: 1 PID: 1372 Comm: nvme Tainted: G O 5.7.0-rc2+ #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1 04/01/2014 RIP: 0010:refcount_warn_saturate+0xa6/0xf0 RSP: 0018:ffffb29e8053bdc0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8b7a2f4fc060 RCX: 0000000000000007 RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff8b7a3ec99980 RBP: ffff8b7a2f4fc000 R08: 00000000000002e1 R09: 0000000000000004 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: fffffffffffffff2 R14: ffffb29e8053bf08 R15: ffff8b7a320e2da0 FS: 00007f135d4ca800(0000) GS:ffff8b7a3ec80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005651178c0c30 CR3: 000000003b650005 CR4: 0000000000360ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: disk_release+0xa2/0xc0 device_release+0x28/0x80 kobject_put+0xa5/0x1b0 nvme_put_ns_head+0x26/0x70 [nvme_core] nvme_put_ns+0x30/0x60 [nvme_core] nvme_remove_namespaces+0x9b/0xe0 [nvme_core] nvme_do_delete_ctrl+0x43/0x5c [nvme_core] nvme_sysfs_delete.cold+0x8/0xd [nvme_core] kernfs_fop_write+0xc1/0x1a0 vfs_write+0xb6/0x1a0 ksys_write+0x5f/0xe0 do_syscall_64+0x52/0x1a0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: Anton Eidelman Tested-by: Anton Eidelman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 79c8caced81f..18d084ed497e 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -690,6 +690,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) kblockd_schedule_work(&head->requeue_work); flush_work(&head->requeue_work); blk_cleanup_queue(head->disk->queue); + if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { + /* + * if device_add_disk wasn't called, prevent + * disk release to put a bogus reference on the + * request queue + */ + head->disk->queue = NULL; + } put_disk(head->disk); }