From b30a337ca27c4f40439e4bfb290cba5f88d73bb7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 30 Mar 2016 08:46:31 +0800 Subject: [PATCH 1/5] block: partition: initialize percpuref before sending out KOBJ_ADD The initialization of partition's percpu_ref should have been done before sending out KOBJ_ADD uevent, which may cause userspace to read partition table. So the uninitialized percpu_ref may be accessed in data path. This patch fixes this issue reported by Naveen. Reported-by: Naveen Kaje Tested-by: Naveen Kaje Fixes: 6c71013ecb7e2(block: partition: convert percpu ref) Cc: # v4.3+ Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/partition-generic.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 5d8701941054..84c53f04777a 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -361,15 +361,20 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } + err = hd_ref_init(p); + if (err) { + if (flags & ADDPART_FLAG_WHOLEDISK) + goto out_remove_file; + goto out_del; + } + /* everything is up and running, commence */ rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - - if (!hd_ref_init(p)) - return p; + return p; out_free_info: free_part_info(p); @@ -378,6 +383,8 @@ out_free_stats: out_free: kfree(p); return ERR_PTR(err); +out_remove_file: + device_remove_file(pdev, &dev_attr_whole_disk); out_del: kobject_put(p->holder_dir); device_del(pdev); From c877ef8ae7b8edaedccad0fc8c23d4d1de7e2480 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 31 Mar 2016 13:19:41 +0000 Subject: [PATCH 2/5] writeback: fix the wrong congested state variable definition The right variable definition should be wb_congested_state that include WB_async_congested and WB_sync_congested. So fix it. Signed-off-by: Kaixu Xia Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- mm/backing-dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bfbd7096b6ed..0c6317b7db38 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -898,7 +898,7 @@ static atomic_t nr_wb_congested[2]; void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) { wait_queue_head_t *wqh = &congestion_wqh[sync]; - enum wb_state bit; + enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; if (test_and_clear_bit(bit, &congested->state)) @@ -911,7 +911,7 @@ EXPORT_SYMBOL(clear_wb_congested); void set_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum wb_state bit; + enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; if (!test_and_set_bit(bit, &congested->state)) From 9bf2b972afeaffd173fe2ce211ebc555ea7e8a87 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 8 Apr 2016 16:11:02 -0600 Subject: [PATCH 3/5] NVMe: Fix reset/remove race This fixes a scenario where device is present and being reset, but a request to unbind the driver occurs. A previous patch series addressing a device failure removal scenario flushed reset_work after controller disable to unblock reset_work waiting on a completion that wouldn't occur. This isn't safe as-is. The broken scenario can potentially be induced with: modprobe nvme && modprobe -r nvme To fix, the reset work is flushed immediately after setting the controller removing flag, and any subsequent reset will not proceed with controller initialization if the flag is set. The controller status must be polled while active, so the watchdog timer is also left active until the controller is disabled to cleanup requests that may be stuck during namespace removal. [Fixes: ff23a2a15a2117245b4599c1352343c8b8fb4c43] Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 24ccda303efb..660ec84bc40f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1859,6 +1859,9 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); + if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) + goto out; + set_bit(NVME_CTRL_RESETTING, &dev->flags); result = nvme_pci_enable(dev); @@ -2078,11 +2081,10 @@ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - del_timer_sync(&dev->watchdog_timer); - set_bit(NVME_CTRL_REMOVING, &dev->flags); pci_set_drvdata(pdev, NULL); flush_work(&dev->async_work); + flush_work(&dev->reset_work); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); nvme_uninit_ctrl(&dev->ctrl); From a5229050b69cfffb690b546c357ca5a60434c0c8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 8 Apr 2016 16:09:10 -0600 Subject: [PATCH 4/5] NVMe: Always use MSI/MSI-x interrupts Multiple users have reported device initialization failure due the driver not receiving legacy PCI interrupts. This is not unique to any particular controller, but has been observed on multiple platforms. There have been no issues reported or observed when with message signaled interrupts, so this patch attempts to use MSI-x during initialization, falling back to MSI. If that fails, legacy would become the default. The setup_io_queues error handling had to change as a result: the admin queue's msix_entry used to be initialized to the legacy IRQ. The case where nr_io_queues is 0 would fail request_irq when setting up the admin queue's interrupt since re-enabling MSI-x fails with 0 vectors, leaving the admin queue's msix_entry invalid. Instead, return success immediately. Reported-by: Tim Muhlemmer Reported-by: Jon Derrick Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 660ec84bc40f..4fd733ff72b1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1478,8 +1478,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result > 0) { dev_err(dev->ctrl.device, "Could not set queue count (%d)\n", result); - nr_io_queues = 0; - result = 0; + return 0; } if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { @@ -1513,7 +1512,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * If we enable msix early due to not intx, disable it again before * setting up the full range we need. */ - if (!pdev->irq) + if (pdev->msi_enabled) + pci_disable_msi(pdev); + else if (pdev->msix_enabled) pci_disable_msix(pdev); for (i = 0; i < nr_io_queues; i++) @@ -1696,7 +1697,6 @@ static int nvme_pci_enable(struct nvme_dev *dev) if (pci_enable_device_mem(pdev)) return result; - dev->entry[0].vector = pdev->irq; pci_set_master(pdev); if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && @@ -1709,13 +1709,18 @@ static int nvme_pci_enable(struct nvme_dev *dev) } /* - * Some devices don't advertse INTx interrupts, pre-enable a single - * MSIX vec for setup. We'll adjust this later. + * Some devices and/or platforms don't advertise or work with INTx + * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll + * adjust this later. */ - if (!pdev->irq) { - result = pci_enable_msix(pdev, dev->entry, 1); - if (result < 0) - goto disable; + if (pci_enable_msix(pdev, dev->entry, 1)) { + pci_enable_msi(pdev); + dev->entry[0].vector = pdev->irq; + } + + if (!dev->entry[0].vector) { + result = -ENODEV; + goto disable; } cap = lo_hi_readq(dev->bar + NVME_REG_CAP); From a7297a6a3a3322b054592e8e988981d2f5f29cc4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Apr 2016 18:51:28 +0800 Subject: [PATCH 5/5] block: loop: fix filesystem corruption in case of aio/dio Starting from commit e36f620428(block: split bios to max possible length), block core starts to split bio in the middle of bvec. Unfortunately loop dio/aio doesn't consider this situation, and always treat 'iter.iov_offset' as zero. Then filesystem corruption is observed. This patch figures out the offset of the base bvevc via 'bio->bi_iter.bi_bvec_done' and fixes the issue by passing the offset to iov iterator. Fixes: e36f6204288088f (block: split bios to max possible length) Cc: Keith Busch Cc: Al Viro Cc: stable@vger.kernel.org (4.5) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 423f4ca7d712..80cf8add46ff 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -488,6 +488,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, bio_segments(bio), blk_rq_bytes(cmd->rq)); + /* + * This bio may be started from the middle of the 'bvec' + * because of bio splitting, so offset from the bvec must + * be passed to iov iterator + */ + iter.iov_offset = bio->bi_iter.bi_bvec_done; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file;