From 71bd150c71072014d98bff6dc2db3229306ece35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2015 07:58:32 +0200 Subject: [PATCH 01/67] nvme: move struct nvme_iod to pci.c This structure is specific to the PCIe driver internals and should be moved to pci.c. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 17 ----------------- drivers/nvme/host/pci.c | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fdb4e5bad9ac..2cead2c98163 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -94,23 +94,6 @@ struct nvme_ns { u32 mode_select_block_len; }; -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - unsigned long private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ - struct scatterlist sg[0]; -}; - static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) { return (sector >> (ns->lba_shift - 9)); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b8a02221233c..0f24d3c531c0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -129,6 +129,23 @@ struct nvme_queue { struct async_cmd_info cmdinfo; }; +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + unsigned long private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ + struct scatterlist sg[0]; +}; + /* * Check we didn't inadvertently grow the command struct */ From 21d34711e1b5970acfb22bddf1fefbfbd7e0123b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 09:08:36 +0100 Subject: [PATCH 02/67] nvme: split command submission helpers out of pci.c Create a new core.c and start by adding the command submission helpers to it, which are already abstracted away from the actual hardware queues by the block layer.
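A note on the abstraction claimed above: the moved helpers only ever touch a struct request_queue, never the PCIe-specific struct nvme_queue, which is what makes them safe to share with future non-PCI transports. A minimal sketch of the submission pattern, condensed from __nvme_submit_sync_cmd as added below (allocation checks and data mapping trimmed):

	struct request *req = blk_mq_alloc_request(q, write, 0);	/* any transport's queue */

	req->cmd_type = REQ_TYPE_DRV_PRIV;	/* carries a raw NVMe command */
	req->cmd = (unsigned char *)cmd;
	req->cmd_len = sizeof(struct nvme_command);

	blk_execute_rq(req->q, NULL, req, 0);	/* blocks until the driver's ->queue_rq completes it */
	ret = req->errors;			/* negative errno or positive NVMe status */
	blk_mq_free_request(req);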
Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/Makefile | 2 +- drivers/nvme/host/core.c | 173 +++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 3 + drivers/nvme/host/pci.c | 155 +-------------------------------- 4 files changed, 178 insertions(+), 155 deletions(-) create mode 100644 drivers/nvme/host/core.c diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 219dc206fa5f..3e26dc921c38 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -nvme-y += pci.o scsi.o lightnvm.o +nvme-y += core.o pci.o scsi.o lightnvm.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c new file mode 100644 index 000000000000..ce938a428928 --- /dev/null +++ b/drivers/nvme/host/core.c @@ -0,0 +1,173 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include + +#include "nvme.h" + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, void __user *ubuffer, unsigned bufflen, + u32 *result, unsigned timeout) +{ + bool write = cmd->common.opcode & 1; + struct bio *bio = NULL; + struct request *req; + int ret; + + req = blk_mq_alloc_request(q, write, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->cmd_type = REQ_TYPE_DRV_PRIV; + req->cmd_flags |= REQ_FAILFAST_DRIVER; + req->__data_len = 0; + req->__sector = (sector_t) -1; + req->bio = req->biotail = NULL; + + req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + + req->cmd = (unsigned char *)cmd; + req->cmd_len = sizeof(struct nvme_command); + req->special = (void *)0; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + if (ret) + goto out; + } else if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + } + + blk_execute_rq(req->q, NULL, req, 0); + if (bio) + blk_rq_unmap_user(bio); + if (result) + *result = (u32)(uintptr_t)req->special; + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); +} + +int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(1); + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, + struct nvme_id_ns **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify, + c.identify.nsid = cpu_to_le32(nsid), + + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ns)); + if (error) + kfree(*id); + return error; +} + +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, + result, 0); +} + +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, + result, 0); +} + +int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) +{ + struct nvme_command c = { }; + int error; + + c.common.opcode = nvme_admin_get_log_page, + c.common.nsid = cpu_to_le32(0xFFFFFFFF), + c.common.cdw10[0] = cpu_to_le32( + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | + NVME_LOG_SMART), + + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); + if (!*log) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, + sizeof(struct nvme_smart_log)); + if (error) + kfree(*log); + return error; +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2cead2c98163..a53977cc9fc2 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -22,6 +22,9 @@ extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) +extern unsigned char admin_timeout; +#define ADMIN_TIMEOUT 
(admin_timeout * HZ) + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0f24d3c531c0..996356261c6b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -52,10 +52,9 @@ #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define ADMIN_TIMEOUT (admin_timeout * HZ) #define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) -static unsigned char admin_timeout = 60; +unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); @@ -1045,65 +1044,6 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -/* - * Returns 0 on success. If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, - u32 *result, unsigned timeout) -{ - bool write = cmd->common.opcode & 1; - struct bio *bio = NULL; - struct request *req; - int ret; - - req = blk_mq_alloc_request(q, write, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_type = REQ_TYPE_DRV_PRIV; - req->cmd_flags |= REQ_FAILFAST_DRIVER; - req->__data_len = 0; - req->__sector = (sector_t) -1; - req->bio = req->biotail = NULL; - - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; - - req->cmd = (unsigned char *)cmd; - req->cmd_len = sizeof(struct nvme_command); - req->special = (void *)0; - - if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - } else if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - bio = req->bio; - } - - blk_execute_rq(req->q, NULL, req, 0); - if (bio) - blk_rq_unmap_user(bio); - if (result) - *result = (u32)(uintptr_t)req->special; - ret = req->errors; - out: - blk_mq_free_request(req); - return ret; -} - -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen) -{ - return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); -} - static int nvme_submit_async_admin_req(struct nvme_dev *dev) { struct nvme_queue *nvmeq = dev->queues[0]; @@ -1216,99 +1156,6 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(1); - - *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ctrl)); - if (error) - kfree(*id); - return error; -} - -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify, - c.identify.nsid = cpu_to_le32(nsid), - - *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ns)); - if (error) - 
kfree(*id); - return error; -} - -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) -{ - struct nvme_command c = { }; - int error; - - c.common.opcode = nvme_admin_get_log_page, - c.common.nsid = cpu_to_le32(0xFFFFFFFF), - c.common.cdw10[0] = cpu_to_le32( - (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | - NVME_LOG_SMART), - - *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); - if (!*log) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, - sizeof(struct nvme_smart_log)); - if (error) - kfree(*log); - return error; -} - /** * nvme_abort_req - Attempt aborting a request * From 7a67cbea653e444d04d7e850ab9631a14a196422 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Nov 2015 08:58:10 +0100 Subject: [PATCH 03/67] nvme: use offset instead of a struct for registers This makes life easier for future non-PCI drivers where access to the registers might be more complicated. Note that Linux drivers are pretty evenly split between the two versions, and in fact the NVMe driver already uses offsets for the doorbells. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch [Fixed CMBSZ offset] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 60 +++++++++++++++++++++------------------- drivers/nvme/host/scsi.c | 6 ++-- include/linux/nvme.h | 27 +++++++++--------- 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a53977cc9fc2..66550b76b05c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -51,7 +51,7 @@ struct nvme_dev { u32 db_stride; u32 ctrl_config; struct msix_entry *entry; - struct nvme_bar __iomem *bar; + void __iomem *bar; struct list_head namespaces; struct kref kref; struct device *device; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 996356261c6b..bfea7ec22b98 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1322,7 +1322,7 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) /* Don't tell the adapter to delete the admin queue. * Don't tell a removed adapter to delete IO queues. 
*/ - if (qid && readl(&dev->bar->csts) != -1) { + if (qid && readl(dev->bar + NVME_REG_CSTS) != -1) { adapter_delete_sq(dev, qid); adapter_delete_cq(dev, qid); } @@ -1475,7 +1475,7 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { + while ((readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_RDY) != bit) { msleep(100); if (fatal_signal_pending(current)) return -EINTR; @@ -1500,7 +1500,7 @@ static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) { dev->ctrl_config &= ~NVME_CC_SHN_MASK; dev->ctrl_config &= ~NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); + writel(dev->ctrl_config, dev->bar + NVME_REG_CC); return nvme_wait_ready(dev, cap, false); } @@ -1509,7 +1509,7 @@ static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) { dev->ctrl_config &= ~NVME_CC_SHN_MASK; dev->ctrl_config |= NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); + writel(dev->ctrl_config, dev->bar + NVME_REG_CC); return nvme_wait_ready(dev, cap, true); } @@ -1521,10 +1521,10 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) dev->ctrl_config &= ~NVME_CC_SHN_MASK; dev->ctrl_config |= NVME_CC_SHN_NORMAL; - writel(dev->ctrl_config, &dev->bar->cc); + writel(dev->ctrl_config, dev->bar + NVME_REG_CC); timeout = SHUTDOWN_TIMEOUT + jiffies; - while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != + while ((readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_SHST_MASK) != NVME_CSTS_SHST_CMPLT) { msleep(100); if (fatal_signal_pending(current)) @@ -1600,7 +1600,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; - u64 cap = lo_hi_readq(&dev->bar->cap); + u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; /* * default to a 4K page size, with the intention to update this @@ -1618,11 +1618,12 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return -ENODEV; } - dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ? + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? 
NVME_CAP_NSSRC(cap) : 0; - if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO)) - writel(NVME_CSTS_NSSRO, &dev->bar->csts); + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); result = nvme_disable_ctrl(dev, cap); if (result < 0) @@ -1645,9 +1646,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - writel(aqa, &dev->bar->aqa); - lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq); - lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(aqa, dev->bar + NVME_REG_AQA); + lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); result = nvme_enable_ctrl(dev, cap); if (result) @@ -1789,7 +1790,7 @@ static int nvme_subsys_reset(struct nvme_dev *dev) if (!dev->subsystem) return -ENOTTY; - writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */ + writel(0x4E564D65, dev->bar + NVME_REG_NSSR); /* "NVMe" */ return 0; } @@ -2076,14 +2077,14 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - u32 csts = readl(&dev->bar->csts); + u32 csts = readl(dev->bar + NVME_REG_CSTS); if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || csts & NVME_CSTS_CFS) { if (!__nvme_reset(dev)) { dev_warn(dev->dev, "Failed status: %x, reset controller\n", - readl(&dev->bar->csts)); + readl(dev->bar + NVME_REG_CSTS)); } continue; } @@ -2243,11 +2244,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (!use_cmb_sqes) return NULL; - dev->cmbsz = readl(&dev->bar->cmbsz); + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!(NVME_CMB_SZ(dev->cmbsz))) return NULL; - cmbloc = readl(&dev->bar->cmbloc); + cmbloc = readl(dev->bar + NVME_REG_CMBLOC); szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); @@ -2321,7 +2322,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return -ENOMEM; size = db_bar_size(dev, nr_io_queues); } while (1); - dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->dbs = dev->bar + 4096; adminq->q_db = dev->dbs; } @@ -2397,8 +2398,9 @@ static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) static inline bool nvme_io_incapable(struct nvme_dev *dev) { - return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || - dev->online_queues < 2); + return (!dev->bar || + readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_CFS || + dev->online_queues < 2); } static void nvme_ns_remove(struct nvme_ns *ns) @@ -2478,7 +2480,7 @@ static int nvme_dev_add(struct nvme_dev *dev) struct pci_dev *pdev = to_pci_dev(dev->dev); int res; struct nvme_id_ctrl *ctrl; - int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12; + int shift = NVME_CAP_MPSMIN(lo_hi_readq(dev->bar + NVME_REG_CAP)) + 12; res = nvme_identify_ctrl(dev, &ctrl); if (res) { @@ -2554,7 +2556,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!dev->bar) goto disable; - if (readl(&dev->bar->csts) == -1) { + if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; goto unmap; } @@ -2569,11 +2571,12 @@ static int nvme_dev_map(struct nvme_dev *dev) goto unmap; } - cap = lo_hi_readq(&dev->bar->cap); + cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); - dev->dbs = ((void __iomem *)dev->bar) + 4096; - if (readl(&dev->bar->vs) >= NVME_VS(1, 
2)) + dev->dbs = dev->bar + 4096; + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) dev->cmb = nvme_map_cmb(dev); return 0; @@ -2632,7 +2635,8 @@ static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) * queues than admin tags. */ set_current_state(TASK_RUNNING); - nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap)); + nvme_disable_ctrl(dev, + lo_hi_readq(dev->bar + NVME_REG_CAP)); nvme_clear_queue(dev->queues[0]); flush_kthread_worker(dq->worker); nvme_disable_queue(dev, 0); @@ -2808,7 +2812,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) if (dev->bar) { nvme_freeze_queues(dev); - csts = readl(&dev->bar->csts); + csts = readl(dev->bar + NVME_REG_CSTS); } if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index c3d8d3887a31..85869946d226 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -611,7 +611,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, memset(inq_response, 0, alloc_len); inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1)) { struct nvme_id_ns *id_ns; void *eui; int len; @@ -623,7 +623,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, eui = id_ns->eui64; len = sizeof(id_ns->eui64); - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) { if (bitmap_empty(eui, len * 8)) { eui = id_ns->nguid; len = sizeof(id_ns->nguid); @@ -2297,7 +2297,7 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns, { struct nvme_dev *dev = ns->dev; - if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + if (!(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_RDY)) return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, NOT_READY, SCSI_ASC_LUN_NOT_READY, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3af5f454c04a..a55986f6fe38 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,20 +17,19 @@ #include -struct nvme_bar { - __u64 cap; /* Controller Capabilities */ - __u32 vs; /* Version */ - __u32 intms; /* Interrupt Mask Set */ - __u32 intmc; /* Interrupt Mask Clear */ - __u32 cc; /* Controller Configuration */ - __u32 rsvd1; /* Reserved */ - __u32 csts; /* Controller Status */ - __u32 nssr; /* Subsystem Reset */ - __u32 aqa; /* Admin Queue Attributes */ - __u64 asq; /* Admin SQ Base Address */ - __u64 acq; /* Admin CQ Base Address */ - __u32 cmbloc; /* Controller Memory Buffer Location */ - __u32 cmbsz; /* Controller Memory Buffer Size */ +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ }; #define NVME_CAP_MQES(cap) ((cap) & 0xffff) From bf7d3ebbd219d8ad948e812d03e1decfd96c97d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 09:55:48
+0100 Subject: [PATCH 04/67] nvme: split nvme_trans_device_id_page Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/scsi.c | 135 +++++++++++++++++++++++---------------- 1 file changed, 79 insertions(+), 56 deletions(-) diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index 85869946d226..b42cf448a55e 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -600,70 +600,93 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) +static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; - int res; - int nvme_sc; - int xfer_len; - __be32 tmp_id = cpu_to_be32(ns->ns_id); + struct nvme_id_ns *id_ns; + int nvme_sc, res; + size_t len; + void *eui; + + nvme_sc = nvme_identify_ns(ns->dev, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + eui = id_ns->eui64; + len = sizeof(id_ns->eui64); + + if (readl(ns->dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) { + if (bitmap_empty(eui, len * 8)) { + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); + } + } + + if (bitmap_empty(eui, len * 8)) { + res = -EOPNOTSUPP; + goto out_free_id; + } memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1)) { - struct nvme_id_ns *id_ns; - void *eui; - int len; + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 4 + len; /* Page Length */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); - eui = id_ns->eui64; - len = sizeof(id_ns->eui64); - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } - if (bitmap_empty(eui, len * 8)) { - kfree(id_ns); - goto scsi_string; - } + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); +out_free_id: + kfree(id_ns); + return res; +} - inq_response[3] = 4 + len; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - kfree(id_ns); - } else { - scsi_string: - if (alloc_len < 72) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - inq_response[3] = 0x48; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ +static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) +{ + struct nvme_dev *dev = ns->dev; - 
sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor); - memcpy(&inq_response[12], dev->model, sizeof(dev->model)); - sprintf(&inq_response[52], "%04x", tmp_id); - memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + if (alloc_len < 72) { + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); } - xfer_len = alloc_len; - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 0x48; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor); + memcpy(&inq_response[12], dev->model, sizeof(dev->model)); + sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); + memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + + return nvme_trans_copy_to_user(hdr, inq_response, alloc_len); +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int alloc_len) +{ + int res; + + if (readl(ns->dev->bar + NVME_REG_VS) >= NVME_VS(1, 1)) { + res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); + if (res != -EOPNOTSUPP) + return res; + } + + return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); } static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, From 01fec28a6f3ba96d4f46a538eae089dd92189fd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 09:59:44 +0100 Subject: [PATCH 05/67] nvme: use vendor id from identify Use the vendor ID from the identify data instead of the PCI device to make the SCSI translation layer independent from the PCI driver. The NVMe spec defines them as having the same value for current PCIe devices.
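The substitution at the heart of this patch, side by side (snippets taken from the hunks below; the two sources carry the same value because the spec mirrors the PCI vendor ID into bytes 1:0 of the Identify Controller data):

	/* before: reaches into the PCI device, tying scsi.c to the PCI transport */
	sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor);

	/* after: read the little-endian VID from the Identify Controller data */
	nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
	res = nvme_trans_status_code(hdr, nvme_sc);
	if (res)
		return res;
	sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid));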
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/scsi.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index b42cf448a55e..0bf90b62ec27 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -649,6 +649,8 @@ static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { struct nvme_dev *dev = ns->dev; + struct nvme_id_ctrl *id_ctrl; + int nvme_sc, res; if (alloc_len < 72) { return nvme_trans_completion(hdr, @@ -657,6 +659,11 @@ static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); } + nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + memset(inq_response, 0, alloc_len); inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; inq_response[3] = 0x48; /* Page Length */ @@ -667,12 +674,14 @@ static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, inq_response[6] = 0x00; /* Rsvd */ inq_response[7] = 0x44; /* Designator Length */ - sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor); + sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); memcpy(&inq_response[12], dev->model, sizeof(dev->model)); sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); - return nvme_trans_copy_to_user(hdr, inq_response, alloc_len); + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); + kfree(id_ctrl); + return res; } static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, From 1c63dc66580d4bbb6d2b75bf184b5aa105ba5bdb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 10:06:56 +0100 Subject: [PATCH 06/67] nvme: split a new struct nvme_ctrl out of struct nvme_dev The new struct nvme_ctrl will be used by the common NVMe code that sits on top of struct request_queue and the new nvme_ctrl_ops abstraction. It only contains the bare minimum required, which consists of values sampled during controller probe, the admin queue pointer and a second struct device pointer at the moment, but more will follow later. Only values that are not used in the I/O fast path should be moved to struct nvme_ctrl so that drivers can optimize their cache line usage easily. That's also the reason why we have two device pointers as the struct device is used for DMA mapping purposes. 
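The shape of the split, condensed from the hunks below: the PCI driver embeds the generic controller and converts back with container_of(), and common code reaches device registers only through the new ops table (the struct body is elided here):

	struct nvme_dev {			/* PCIe-private state stays in pci.c */
		...
		void __iomem *bar;
		struct nvme_ctrl ctrl;		/* generic part, embedded */
	};

	static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
	{
		return container_of(ctrl, struct nvme_dev, ctrl);
	}

	static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
	{
		*val = readl(to_nvme_dev(ctrl)->bar + off);
		return 0;
	}

	/* transport-agnostic caller, from nvme_ctrl_ready() in nvme.h: */
	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
		return false;
	return val & NVME_CSTS_RDY;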
Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 +-- drivers/nvme/host/nvme.h | 61 +++++-------- drivers/nvme/host/pci.c | 190 ++++++++++++++++++++++++++------------- drivers/nvme/host/scsi.c | 89 +++++++++--------- 4 files changed, 193 insertions(+), 157 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce938a428928..ca54a34665ac 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -79,7 +79,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); } -int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; int error; @@ -99,7 +99,7 @@ int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) return error; } -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id) { struct nvme_command c = { }; @@ -120,7 +120,7 @@ int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, return error; } -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -135,7 +135,7 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, result, 0); } -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -150,7 +150,7 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, result, 0); } -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) { struct nvme_command c = { }; int error; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 66550b76b05c..19583e1125e6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -30,46 +30,16 @@ enum { NVME_NS_LIGHTNVM = 1, }; -/* - * Represents an NVM Express device. Each nvme_dev is a PCI function. 
- */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; +struct nvme_ctrl { + const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; - struct blk_mq_tag_set tagset; - struct blk_mq_tag_set admin_tagset; - u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; int instance; - unsigned queue_count; - unsigned online_queues; - unsigned max_qid; - int q_depth; - u32 db_stride; - u32 ctrl_config; - struct msix_entry *entry; - void __iomem *bar; - struct list_head namespaces; - struct kref kref; - struct device *device; - struct work_struct reset_work; - struct work_struct probe_work; - struct work_struct scan_work; + char name[12]; char serial[20]; char model[40]; char firmware_rev[8]; - bool subsystem; - u32 max_hw_sectors; - u32 stripe_size; - u32 page_size; - void __iomem *cmb; - dma_addr_t cmb_dma_addr; - u64 cmb_size; - u32 cmbsz; u16 oncs; u16 abort_limit; u8 event_limit; @@ -82,7 +52,7 @@ struct nvme_dev { struct nvme_ns { struct list_head list; - struct nvme_dev *dev; + struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; struct kref kref; @@ -97,6 +67,19 @@ struct nvme_ns { u32 mode_select_block_len; }; +struct nvme_ctrl_ops { + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); +}; + +static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_RDY; +} + static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) { return (sector >> (ns->lba_shift - 9)); @@ -107,13 +90,13 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout); -int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id); -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log); -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result); -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); struct sg_io_hdr; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bfea7ec22b98..8a564f4ecf99 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -87,6 +87,9 @@ static wait_queue_head_t nvme_kthread_wait; static struct class *nvme_class; +struct nvme_dev; +struct nvme_queue; + static int __nvme_reset(struct nvme_dev *dev); static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); @@ -101,6 +104,49 @@ struct async_cmd_info { void *ctx; }; +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. 
+ */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; + u32 __iomem *dbs; + struct device *dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + unsigned queue_count; + unsigned online_queues; + unsigned max_qid; + int q_depth; + u32 db_stride; + u32 ctrl_config; + struct msix_entry *entry; + void __iomem *bar; + struct list_head namespaces; + struct kref kref; + struct device *device; + struct work_struct reset_work; + struct work_struct probe_work; + struct work_struct scan_work; + bool subsystem; + u32 max_hw_sectors; + u32 stripe_size; + u32 page_size; + void __iomem *cmb; + dma_addr_t cmb_dma_addr; + u64 cmb_size; + u32 cmbsz; + + struct nvme_ctrl ctrl; +}; + +static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_dev, ctrl); +} + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -333,7 +379,7 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, u16 status = le16_to_cpup(&cqe->status) >> 1; if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; + ++nvmeq->dev->ctrl.event_limit; if (status != NVME_SC_SUCCESS) return; @@ -357,7 +403,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx, blk_mq_free_request(req); dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); - ++nvmeq->dev->abort_limit; + ++nvmeq->dev->ctrl.abort_limit; } static void async_completion(struct nvme_queue *nvmeq, void *ctx, @@ -1051,7 +1097,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) struct nvme_cmd_info *cmd_info; struct request *req; - req = blk_mq_alloc_request(dev->admin_q, WRITE, + req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); if (IS_ERR(req)) return PTR_ERR(req); @@ -1077,7 +1123,7 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct request *req; struct nvme_cmd_info *cmd_rq; - req = blk_mq_alloc_request(dev->admin_q, WRITE, 0); + req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1101,7 +1147,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, @@ -1122,7 +1168,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, @@ -1143,7 +1189,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1182,10 +1228,10 @@ static void nvme_abort_req(struct request *req) return; } - if (!dev->abort_limit) + if (!dev->ctrl.abort_limit) return; - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, + abort_req = 
blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) return; @@ -1199,7 +1245,7 @@ static void nvme_abort_req(struct request *req) cmd.abort.sqid = cpu_to_le16(nvmeq->qid); cmd.abort.command_id = abort_req->tag; - --dev->abort_limit; + --dev->ctrl.abort_limit; cmd_rq->aborted = 1; dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, @@ -1294,8 +1340,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->cq_vector = -1; spin_unlock_irq(&nvmeq->q_lock); - if (!nvmeq->qid && nvmeq->dev->admin_q) - blk_mq_freeze_queue_start(nvmeq->dev->admin_q); + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + blk_mq_freeze_queue_start(nvmeq->dev->ctrl.admin_q); irq_set_affinity_hint(vector, NULL); free_irq(vector, nvmeq); @@ -1391,7 +1437,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", - dev->instance, qid); + dev->ctrl.instance, qid); spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1559,15 +1605,15 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { - if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { - blk_cleanup_queue(dev->admin_q); + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } static int nvme_alloc_admin_tags(struct nvme_dev *dev) { - if (!dev->admin_q) { + if (!dev->ctrl.admin_q) { dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; @@ -1580,18 +1626,18 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->admin_tagset)) return -ENOMEM; - dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); - if (IS_ERR(dev->admin_q)) { + dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->ctrl.admin_q)) { blk_mq_free_tag_set(&dev->admin_tagset); return -ENOMEM; } - if (!blk_get_queue(dev->admin_q)) { + if (!blk_get_queue(dev->ctrl.admin_q)) { nvme_dev_remove_admin(dev); - dev->admin_q = NULL; + dev->ctrl.admin_q = NULL; return -ENODEV; } } else - blk_mq_unfreeze_queue(dev->admin_q); + blk_mq_unfreeze_queue(dev->ctrl.admin_q); return 0; } @@ -1670,7 +1716,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { - struct nvme_dev *dev = ns->dev; + struct nvme_dev *dev = to_nvme_dev(ns->ctrl); struct nvme_user_io io; struct nvme_command c; unsigned length, meta_len; @@ -1745,7 +1791,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; @@ -1774,7 +1820,7 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c, + status = __nvme_submit_sync_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, &cmd.result, timeout); if (status >= 0) { @@ -1804,9 +1850,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, ns, (void __user *)arg); + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -1836,6 +1882,7 @@ static void nvme_free_dev(struct kref *kref); static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + struct nvme_dev *dev = to_nvme_dev(ns->ctrl); if (ns->type == NVME_NS_LIGHTNVM) nvme_nvm_unregister(ns->queue, ns->disk->disk_name); @@ -1844,7 +1891,7 @@ static void nvme_free_ns(struct kref *kref) ns->disk->private_data = NULL; spin_unlock(&dev_list_lock); - kref_put(&ns->dev->kref, nvme_free_dev); + kref_put(&dev->kref, nvme_free_dev); put_disk(ns->disk); kfree(ns); } @@ -1893,15 +1940,15 @@ static void nvme_config_discard(struct nvme_ns *ns) static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; - struct nvme_dev *dev = ns->dev; + struct nvme_dev *dev = to_nvme_dev(ns->ctrl); struct nvme_id_ns *id; u8 lbaf, pi_type; u16 old_ms; unsigned short bs; - if (nvme_identify_ns(dev, ns->ns_id, &id)) { + if (nvme_identify_ns(&dev->ctrl, ns->ns_id, &id)) { dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, - dev->instance, ns->ns_id); + dev->ctrl.instance, ns->ns_id); return -ENODEV; } if (id->ncap == 0) { @@ -1957,7 +2004,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) else set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - if (dev->oncs & NVME_CTRL_ONCS_DSM) + if (dev->ctrl.oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); @@ -2095,10 +2142,10 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - while ((i == 0) && (dev->event_limit > 0)) { + while (i == 0 && dev->ctrl.event_limit > 0) { if (nvme_submit_async_admin_req(dev)) break; - dev->event_limit--; + dev->ctrl.event_limit--; } spin_unlock_irq(&nvmeq->q_lock); } @@ -2124,7 +2171,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) goto out_free_ns; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - ns->dev = dev; + ns->ctrl = &dev->ctrl; ns->queue->queuedata = ns; disk = alloc_disk_node(0, node); @@ -2145,7 +2192,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) } if (dev->stripe_size) blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); - if (dev->vwc & NVME_CTRL_VWC_PRESENT) + if (dev->ctrl.vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); blk_queue_virt_boundary(ns->queue, dev->page_size - 1); @@ -2156,7 +2203,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) disk->queue = ns->queue; disk->driverfs_dev = dev->device; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); + sprintf(disk->disk_name, "nvme%dn%d", dev->ctrl.instance, nsid); /* * Initialize capacity to 0 until we establish the namespace format and @@ -2221,7 +2268,7 @@ static int 
set_queue_count(struct nvme_dev *dev, int count) u32 result; u32 q_count = (count - 1) | ((count - 1) << 16); - status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); if (status < 0) return status; @@ -2405,7 +2452,8 @@ static inline bool nvme_io_incapable(struct nvme_dev *dev) static void nvme_ns_remove(struct nvme_ns *ns) { - bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); + bool kill = nvme_io_incapable(to_nvme_dev(ns->ctrl)) && + !blk_queue_dying(ns->queue); if (kill) blk_set_queue_dying(ns->queue); @@ -2462,7 +2510,7 @@ static void nvme_dev_scan(struct work_struct *work) if (!dev->tagset.tags) return; - if (nvme_identify_ctrl(dev, &ctrl)) + if (nvme_identify_ctrl(&dev->ctrl, &ctrl)) return; nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); kfree(ctrl); @@ -2482,18 +2530,18 @@ static int nvme_dev_add(struct nvme_dev *dev) struct nvme_id_ctrl *ctrl; int shift = NVME_CAP_MPSMIN(lo_hi_readq(dev->bar + NVME_REG_CAP)) + 12; - res = nvme_identify_ctrl(dev, &ctrl); + res = nvme_identify_ctrl(&dev->ctrl, &ctrl); if (res) { dev_err(dev->dev, "Identify Controller failed (%d)\n", res); return -EIO; } - dev->oncs = le16_to_cpup(&ctrl->oncs); - dev->abort_limit = ctrl->acl + 1; - dev->vwc = ctrl->vwc; - memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); + dev->ctrl.oncs = le16_to_cpup(&ctrl->oncs); + dev->ctrl.abort_limit = ctrl->acl + 1; + dev->ctrl.vwc = ctrl->vwc; + memcpy(dev->ctrl.serial, ctrl->sn, sizeof(ctrl->sn)); + memcpy(dev->ctrl.model, ctrl->mn, sizeof(ctrl->mn)); + memcpy(dev->ctrl.firmware_rev, ctrl->fr, sizeof(ctrl->fr)); if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); else @@ -2728,7 +2776,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) DEFINE_KTHREAD_WORKER_ONSTACK(worker); struct nvme_delq_ctx dq; struct task_struct *kworker_task = kthread_run(kthread_worker_fn, - &worker, "nvme%d", dev->instance); + &worker, "nvme%d", dev->ctrl.instance); if (IS_ERR(kworker_task)) { dev_err(dev->dev, @@ -2879,14 +2927,14 @@ static int nvme_set_instance(struct nvme_dev *dev) if (error) return -ENODEV; - dev->instance = instance; + dev->ctrl.instance = instance; return 0; } static void nvme_release_instance(struct nvme_dev *dev) { spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->instance); + ida_remove(&nvme_instance_ida, dev->ctrl.instance); spin_unlock(&dev_list_lock); } @@ -2899,8 +2947,8 @@ static void nvme_free_dev(struct kref *kref) nvme_release_instance(dev); if (dev->tagset.tags) blk_mq_free_tag_set(&dev->tagset); - if (dev->admin_q) - blk_put_queue(dev->admin_q); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2914,8 +2962,8 @@ static int nvme_dev_open(struct inode *inode, struct file *f) spin_lock(&dev_list_lock); list_for_each_entry(dev, &dev_list, node) { - if (dev->instance == instance) { - if (!dev->admin_q) { + if (dev->ctrl.instance == instance) { + if (!dev->ctrl.admin_q) { ret = -EWOULDBLOCK; break; } @@ -2945,12 +2993,12 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, NULL, (void __user *)arg); + return nvme_user_cmd(&dev->ctrl, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: if (list_empty(&dev->namespaces)) return -ENOTTY; ns = 
list_first_entry(&dev->namespaces, struct nvme_ns, list); - return nvme_user_cmd(dev, ns, (void __user *)arg); + return nvme_user_cmd(&dev->ctrl, ns, (void __user *)arg); case NVME_IOCTL_RESET: dev_warn(dev->dev, "resetting controller\n"); return nvme_reset(dev); @@ -3011,7 +3059,7 @@ static void nvme_probe_work(struct work_struct *work) if (result) goto free_tags; - dev->event_limit = 1; + dev->ctrl.event_limit = 1; /* * Keep the controller around but remove all namespaces if we don't have @@ -3029,8 +3077,8 @@ static void nvme_probe_work(struct work_struct *work) free_tags: nvme_dev_remove_admin(dev); - blk_put_queue(dev->admin_q); - dev->admin_q = NULL; + blk_put_queue(dev->ctrl.admin_q); + dev->ctrl.admin_q = NULL; dev->queues[0]->tags = NULL; disable: nvme_disable_queue(dev, 0); @@ -3058,7 +3106,7 @@ static void nvme_dead_ctrl(struct nvme_dev *dev) dev_warn(dev->dev, "Device failed to resume\n"); kref_get(&dev->kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->instance))) { + dev->ctrl.instance))) { dev_err(dev->dev, "Failed to start controller remove task\n"); kref_put(&dev->kref, nvme_free_dev); @@ -3100,7 +3148,7 @@ static int nvme_reset(struct nvme_dev *dev) { int ret; - if (!dev->admin_q || blk_queue_dying(dev->admin_q)) + if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) return -ENODEV; spin_lock(&dev_list_lock); @@ -3131,6 +3179,16 @@ static ssize_t nvme_sysfs_reset(struct device *dev, } static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) +{ + *val = readl(to_nvme_dev(ctrl)->bar + off); + return 0; +} + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .reg_read32 = nvme_pci_reg_read32, +}; + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -3156,6 +3214,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->reset_work, nvme_reset_work); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); + + dev->ctrl.ops = &nvme_pci_ctrl_ops; + dev->ctrl.dev = dev->dev; + result = nvme_set_instance(dev); if (result) goto put_pci; @@ -3166,8 +3228,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) kref_init(&dev->kref); dev->device = device_create(nvme_class, &pdev->dev, - MKDEV(nvme_char_major, dev->instance), - dev, "nvme%d", dev->instance); + MKDEV(nvme_char_major, dev->ctrl.instance), + dev, "nvme%d", dev->ctrl.instance); if (IS_ERR(dev->device)) { result = PTR_ERR(dev->device); goto release_pools; @@ -3186,7 +3248,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) return 0; put_dev: - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance)); put_device(dev->device); release_pools: nvme_release_prp_pools(dev); @@ -3233,7 +3295,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_remove(dev); nvme_dev_shutdown(dev); nvme_dev_remove_admin(dev); - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance)); nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index 0bf90b62ec27..bba29553bc94 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -524,7 +524,7 @@ static int nvme_trans_standard_inquiry_page(struct 
nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id_ns; int res; int nvme_sc; @@ -532,10 +532,10 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, u8 resp_data_format = 0x02; u8 protect; u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(dev->firmware_rev); + u8 fw_offset = sizeof(ctrl->firmware_rev); /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -553,12 +553,12 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], dev->model, 16); + strncpy(&inq_response[16], ctrl->model, 16); - while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) fw_offset--; fw_offset -= 4; - strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); + strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); @@ -588,27 +588,26 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; int xfer_len; memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); + strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) + u8 *inq_response, int alloc_len, u32 vs) { struct nvme_id_ns *id_ns; int nvme_sc, res; size_t len; void *eui; - nvme_sc = nvme_identify_ns(ns->dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -616,7 +615,7 @@ static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, eui = id_ns->eui64; len = sizeof(id_ns->eui64); - if (readl(ns->dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) { + if (vs >= NVME_VS(1, 2)) { if (bitmap_empty(eui, len * 8)) { eui = id_ns->nguid; len = sizeof(id_ns->nguid); @@ -648,7 +647,7 @@ out_free_id: static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ctrl *id_ctrl; int nvme_sc, res; @@ -659,7 +658,7 @@ static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); } - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -675,9 +674,9 @@ static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, inq_response[7] = 0x44; /* Designator Length */ sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); - memcpy(&inq_response[12], dev->model, 
sizeof(dev->model)); + memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); - memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); kfree(id_ctrl); @@ -688,9 +687,14 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *resp, int alloc_len) { int res; + u32 vs; - if (readl(ns->dev->bar + NVME_REG_VS) >= NVME_VS(1, 1)) { - res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); + res = ns->ctrl->ops->reg_read32(ns->ctrl, NVME_REG_VS, &vs); + if (res) + return res; + + if (vs >= NVME_VS(1, 1)) { + res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len, vs); if (res != -EOPNOTSUPP) return res; } @@ -704,7 +708,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response; int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ctrl *id_ctrl; struct nvme_id_ns *id_ns; int xfer_len; @@ -720,7 +724,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (inq_response == NULL) return -ENOMEM; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out_free_inq; @@ -736,7 +740,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, app_chk = protect << 1; ref_chk = protect; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out_free_inq; @@ -847,7 +851,6 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u8 temp_c; u16 temp_k; @@ -856,7 +859,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -894,7 +897,6 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u32 feature_resp; u8 temp_c_cur, temp_c_thresh; @@ -904,7 +906,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -918,7 +920,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, kfree(smart_log); /* Get Features for Temp Threshold */ - res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, &feature_resp); if (res != NVME_SC_SUCCESS) temp_c_thresh = LOG_TEMP_UNKNOWN; @@ -980,7 +982,6 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 flbas; u32 lba_length; @@ -990,7 +991,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) return -EINVAL; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = 
nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1046,14 +1047,13 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u32 feature_resp; u8 vwc; if (len < MODE_PAGE_CACHING_LEN) return -EINVAL; - nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -1239,12 +1239,11 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ctrl *id_ctrl; int lowest_pow_st; /* max npss = lowest power consumption */ unsigned ps_desired = 0; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1288,7 +1287,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); break; } - nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, NULL); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1312,7 +1311,6 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr u8 buffer_id) { int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_command c; if (hdr->iovec_count > 0) { @@ -1329,7 +1327,7 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, + nvme_sc = __nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, hdr->dxferp, tot_len, NULL, 0); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1396,14 +1394,13 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; unsigned dword11; switch (page_code) { case MODE_PAGE_CACHING: dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 
1 : 0); - nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, - 0, NULL); + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, + dword11, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); break; case MODE_PAGE_CONTROL: @@ -1505,7 +1502,6 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u8 flbas; /* @@ -1518,7 +1514,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { struct nvme_id_ns *id_ns; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1602,7 +1598,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 i; u8 flbas, nlbaf; @@ -1611,7 +1606,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1643,7 +1638,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.format.nsid = cpu_to_le32(ns->ns_id); c.format.cdw10 = cpu_to_le32(cdw10); - nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); res = nvme_trans_status_code(hdr, nvme_sc); kfree(id_ns); @@ -2072,7 +2067,6 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, u32 alloc_len; u32 resp_size; u32 xfer_len; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 *response; @@ -2084,7 +2078,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, resp_size = READ_CAP_10_RESP_SIZE; } - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2112,7 +2106,6 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, int nvme_sc; u32 alloc_len, xfer_len, resp_size; u8 *response; - struct nvme_dev *dev = ns->dev; struct nvme_id_ctrl *id_ctrl; u32 ll_length, lun_id; u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; @@ -2126,7 +2119,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, case ALL_LUNS_RETURNED: case ALL_WELL_KNOWN_LUNS_RETURNED: case RESTRICTED_LUNS_RETURNED: - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2327,9 +2320,7 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - struct nvme_dev *dev = ns->dev; - - if (!(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_RDY)) + if (nvme_ctrl_ready(ns->ctrl)) return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, NOT_READY, SCSI_ASC_LUN_NOT_READY, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); From 69d2b571746d1c3fa10b7a0aa00859b296a98d12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2015 07:58:37 +0200 Subject: [PATCH 07/67] nvme: simplify nvme_setup_prps calling convention Pass back a true/false value instead of the length which needs a compare with the 
bytes in the request and drop the pointless gfp_t argument. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8a564f4ecf99..75970fdeeecb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -709,9 +709,8 @@ release_iod: blk_mq_complete_request(req, error); } -/* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, - int total_len, gfp_t gfp) +static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, + int total_len) { struct dma_pool *pool; int length = total_len; @@ -727,7 +726,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, length -= (page_size - offset); if (length <= 0) - return total_len; + return true; dma_len -= (page_size - offset); if (dma_len) { @@ -740,7 +739,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, if (length <= page_size) { iod->first_dma = dma_addr; - return total_len; + return true; } nprps = DIV_ROUND_UP(length, page_size); @@ -752,11 +751,11 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, iod->npages = 1; } - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + page_size; + return false; } list[0] = prp_list; iod->first_dma = prp_dma; @@ -764,9 +763,9 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, for (;;) { if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) - return total_len - length; + return false; list[iod->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); @@ -786,7 +785,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, dma_len = sg_dma_len(sg); } - return total_len; + return true; } static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, @@ -952,8 +951,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) goto retry_cmd; - if (blk_rq_bytes(req) != - nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { + if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req))) { dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); goto retry_cmd; } From ba1ca37ea4e320c108c356eb8c91ac652afc57dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2015 07:58:38 +0200 Subject: [PATCH 08/67] nvme: refactor nvme_queue_rq This "backports" the structure I've used for the fabrics driver. It mostly started out as a cleanup so that I could actually understand the code, but I think it also qualifies as a micro-optimization due to the reduced time we hold q_lock and disable interrupts. 
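For illustration only (a condensed sketch, not part of the diff below; the REQ_TYPE_DRV_PRIV and discard branches and the nvme_set_info() bookkeeping are elided), the reshaped hot path does all command setup and DMA mapping before q_lock is taken, using the helpers this series introduces:

	static int nvme_queue_rq_sketch(struct nvme_queue *nvmeq,
			struct nvme_dev *dev, struct nvme_ns *ns,
			struct request *req, struct nvme_iod *iod)
	{
		struct nvme_command cmnd;
		int ret = BLK_MQ_RQ_QUEUE_OK;

		/* build the SQE and map data without holding any lock */
		if (req->cmd_flags & REQ_FLUSH)
			nvme_setup_flush(ns, &cmnd);
		else
			nvme_setup_rw(ns, req, &cmnd);
		if (req->nr_phys_segments)
			ret = nvme_map_data(dev, iod, &cmnd);
		if (ret) {
			nvme_free_iod(dev, iod);
			return ret;
		}
		cmnd.common.command_id = req->tag;

		/* q_lock now covers only the SQE post and CQ reaping */
		spin_lock_irq(&nvmeq->q_lock);
		__nvme_submit_cmd(nvmeq, &cmnd);
		nvme_process_cq(nvmeq);
		spin_unlock_irq(&nvmeq->q_lock);
		return BLK_MQ_RQ_QUEUE_OK;
	}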
Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 221 ++++++++++++++++++---------------------- 1 file changed, 98 insertions(+), 123 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 75970fdeeecb..e5f53f159069 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -788,19 +788,53 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, return true; } -static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, - struct nvme_iod *iod) +static int nvme_map_data(struct nvme_dev *dev, struct nvme_iod *iod, + struct nvme_command *cmnd) { - struct nvme_command cmnd; + struct request *req = iod_get_private(iod); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + int ret = BLK_MQ_RQ_QUEUE_ERROR; - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - cmnd.rw.command_id = req->tag; - if (req->nr_phys_segments) { - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); + sg_init_table(iod->sg, req->nr_phys_segments); + iod->nents = blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) + goto out; + + ret = BLK_MQ_RQ_QUEUE_BUSY; + if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) + goto out; + + if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req))) + goto out_unmap; + + ret = BLK_MQ_RQ_QUEUE_ERROR; + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(q, req->bio) != 1) + goto out_unmap; + + sg_init_table(iod->meta_sg, 1); + if (blk_rq_map_integrity_sg(q, req->bio, iod->meta_sg) != 1) + goto out_unmap; + + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(dev->dev, iod->meta_sg, 1, dma_dir)) + goto out_unmap; } - __nvme_submit_cmd(nvmeq, &cmnd); + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + if (blk_integrity_rq(req)) + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + return BLK_MQ_RQ_QUEUE_OK; + +out_unmap: + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); +out: + return ret; } /* @@ -808,46 +842,42 @@ static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, * worth having a special pool for these or additional cases to handle freeing * the iod. 
*/ -static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_iod *iod) +static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct nvme_iod *iod, struct nvme_command *cmnd) { - struct nvme_dsm_range *range = - (struct nvme_dsm_range *)iod_list(iod)[0]; - struct nvme_command cmnd; + struct request *req = iod_get_private(iod); + struct nvme_dsm_range *range; + + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return BLK_MQ_RQ_QUEUE_BUSY; + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; range->cattr = cpu_to_le32(0); range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.dsm.opcode = nvme_cmd_dsm; - cmnd.dsm.command_id = req->tag; - cmnd.dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd.dsm.nr = 0; - cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - __nvme_submit_cmd(nvmeq, &cmnd); + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + return BLK_MQ_RQ_QUEUE_OK; } -static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, - int cmdid) +static void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { - struct nvme_command cmnd; - - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.common.opcode = nvme_cmd_flush; - cmnd.common.command_id = cmdid; - cmnd.common.nsid = cpu_to_le32(ns->ns_id); - - __nvme_submit_cmd(nvmeq, &cmnd); + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct nvme_ns *ns) +static void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) { - struct request *req = iod_get_private(iod); - struct nvme_command cmnd; u16 control = 0; u32 dsmgmt = 0; @@ -859,14 +889,12 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd.rw.command_id = req->tag; - cmnd.rw.nsid = cpu_to_le32(ns->ns_id); - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); if (ns->ms) { switch (ns->pi_type) { @@ -877,23 +905,16 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; - cmnd.rw.reftag = cpu_to_le32( + cmnd->rw.reftag = cpu_to_le32( nvme_block_nr(ns, blk_rq_pos(req))); break; } - if (blk_integrity_rq(req)) - cmnd.rw.metadata = - cpu_to_le64(sg_dma_address(iod->meta_sg)); - else + if (!blk_integrity_rq(req)) control |= NVME_RW_PRINFO_PRACT; } - cmnd.rw.control = cpu_to_le16(control); - cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt); - - __nvme_submit_cmd(nvmeq, &cmnd); - - return 0; + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); } /* @@ -908,7 +929,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - enum dma_data_direction dma_dir; + struct nvme_command cmnd; + int ret = BLK_MQ_RQ_QUEUE_OK; /* * If formated with metadata, require the block layer provide a buffer @@ -928,80 +950,33 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_MQ_RQ_QUEUE_BUSY; if (req->cmd_flags & REQ_DISCARD) { - void *range; - /* - * We reuse the small pool to allocate the 16-byte range here - * as it is not worth having a special pool for these or - * additional cases to handle freeing the iod. - */ - range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - goto retry_cmd; - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - } else if (req->nr_phys_segments) { - dma_dir = rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; + ret = nvme_setup_discard(nvmeq, ns, iod, &cmnd); + } else { + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(&cmnd, req->cmd, sizeof(cmnd)); + else if (req->cmd_flags & REQ_FLUSH) + nvme_setup_flush(ns, &cmnd); + else + nvme_setup_rw(ns, req, &cmnd); - sg_init_table(iod->sg, req->nr_phys_segments); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); - if (!iod->nents) - goto error_cmd; - - if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) - goto retry_cmd; - - if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req))) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); - goto retry_cmd; - } - if (blk_integrity_rq(req)) { - if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } - - sg_init_table(iod->meta_sg, 1); - if (blk_rq_map_integrity_sg( - req->q, req->bio, iod->meta_sg) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } - - if (rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_prep); - - if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } - } + if (req->nr_phys_segments) + ret = nvme_map_data(dev, iod, &cmnd); } - nvme_set_info(cmd, iod, req_completion); - spin_lock_irq(&nvmeq->q_lock); - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - nvme_submit_priv(nvmeq, req, iod); - else if (req->cmd_flags & REQ_DISCARD) - nvme_submit_discard(nvmeq, ns, req, iod); - else if (req->cmd_flags & REQ_FLUSH) - nvme_submit_flush(nvmeq, ns, req->tag); - else - nvme_submit_iod(nvmeq, iod, ns); + if (ret) + goto out; + cmnd.common.command_id = req->tag; + nvme_set_info(cmd, iod, req_completion); + + spin_lock_irq(&nvmeq->q_lock); + __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); return BLK_MQ_RQ_QUEUE_OK; - - error_cmd: +out: nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_ERROR; - retry_cmd: - nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_BUSY; + return ret; } static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) From d4f6c3aba5b496a2cb80a8e8e082ae51e46579f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 10:51:23 +0100 Subject: [PATCH 09/67] nvme: factor out a nvme_unmap_data helper This is the counterpart to nvme_map_data. 
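To illustrate the symmetry (a sketch using the helpers from this series, not part of the diff), submission and completion now pair up as:

	/* submission path, in nvme_queue_rq(): */
	ret = nvme_map_data(dev, iod, &cmnd);

	/* completion path, in req_completion(): */
	nvme_unmap_data(nvmeq->dev, iod);
	blk_mq_complete_request(req, error);

nvme_unmap_data() also takes over the final nvme_free_iod() call, so every successfully mapped request is torn down in exactly one place.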
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 43 ++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e5f53f159069..801d51d4ea10 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -89,10 +89,12 @@ static struct class *nvme_class; struct nvme_dev; struct nvme_queue; +struct nvme_iod; static int __nvme_reset(struct nvme_dev *dev); static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); +static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod); static void nvme_dead_ctrl(struct nvme_dev *dev); struct async_cmd_info { @@ -655,7 +657,6 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct request *req = iod_get_private(iod); struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); u16 status = le16_to_cpup(&cqe->status) >> 1; - bool requeue = false; int error = 0; if (unlikely(status)) { @@ -663,13 +664,14 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, && (jiffies - req->start_time) < req->timeout) { unsigned long flags; - requeue = true; + nvme_unmap_data(nvmeq->dev, iod); + blk_mq_requeue_request(req); spin_lock_irqsave(req->q->queue_lock, flags); if (!blk_queue_stopped(req->q)) blk_mq_kick_requeue_list(req->q); spin_unlock_irqrestore(req->q->queue_lock, flags); - goto release_iod; + return; } if (req->cmd_type == REQ_TYPE_DRV_PRIV) { @@ -692,21 +694,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, "completing aborted command with status:%04x\n", error); -release_iod: - if (iod->nents) { - dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (blk_integrity_rq(req)) { - if (!rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - } - } - nvme_free_iod(nvmeq->dev, iod); - - if (likely(!requeue)) - blk_mq_complete_request(req, error); + nvme_unmap_data(nvmeq->dev, iod); + blk_mq_complete_request(req, error); } static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, @@ -837,6 +826,24 @@ out: return ret; } +static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod) +{ + struct request *req = iod_get_private(iod); + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + if (iod->nents) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(dev->dev, iod->meta_sg, 1, dma_dir); + } + } + + nvme_free_iod(dev, iod); +} + /* * We reuse the small pool to allocate the 16-byte range here as it is not * worth having a special pool for these or additional cases to handle freeing From 15a190f7f57a2e46717490c35ac09882042a200b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2015 07:58:39 +0200 Subject: [PATCH 10/67] nvme: move nvme_error_status to common code And mark it inline so that we don't slow down the completion path by having to turn it into a forced out of line call. 
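For reference, a minimal sketch of the completion-path usage (mirroring req_completion() in pci.c, with the REQ_TYPE_DRV_PRIV special case left out):

	u16 status = le16_to_cpup(&cqe->status) >> 1;
	int error = 0;

	if (unlikely(status))
		error = nvme_error_status(status); /* maps NVMe status to -ENOSPC/-EIO */

	blk_mq_complete_request(req, error);

Since the mapping is a tiny switch, inlining it lets the compiler fold it into the caller instead of emitting a call.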
Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 12 ++++++++++++ drivers/nvme/host/pci.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 19583e1125e6..9f771265487b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -85,6 +85,18 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } +static inline int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 801d51d4ea10..d29d36d726b5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -547,18 +547,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static int nvme_error_status(u16 status) -{ - switch (status & 0x7ff) { - case NVME_SC_SUCCESS: - return 0; - case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - } -} - #ifdef CONFIG_BLK_DEV_INTEGRITY static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) { From 22944e9981db1e496d983298fd420a8c6b758c80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2015 07:58:40 +0200 Subject: [PATCH 11/67] nvme: move nvme_setup_flush and nvme_setup_rw to common code And mark them inline so that we don't slow down the I/O submission path by having to turn them into forced out of line calls. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 51 ++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/pci.c | 49 -------------------------------------- 2 files changed, 51 insertions(+), 49 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9f771265487b..641741252e62 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -85,6 +85,57 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) return (sector >> (ns->lba_shift - 9)); } +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); +} + +static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (ns->ms) { + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + if (!blk_integrity_rq(req)) + control |= NVME_RW_PRINFO_PRACT; + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); +} + + static inline int nvme_error_status(u16 status) { switch (status & 0x7ff) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d29d36d726b5..c2d2b8a1a4de 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -863,55 +863,6 @@ static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, return BLK_MQ_RQ_QUEUE_OK; } -static void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) -{ - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); -} - -static void nvme_setup_rw(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) -{ - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd->rw.command_id = req->tag; - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (ns->ms) { - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - cmnd->rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); - break; - } - if (!blk_integrity_rq(req)) - control |= NVME_RW_PRINFO_PRACT; - } - - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); -} - /* * NOTE: ns is NULL when called on the admin queue. */ From 4160982e7594481d6b7f90aa693638a37d20ea17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Nov 2015 09:00:02 +0100 Subject: [PATCH 12/67] nvme: split __nvme_submit_sync_cmd Add a separate nvme_submit_user_cmd for commands that directly DMA to or from userspace. We'll add metadata support to that soon and the common version would become too messy. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 81 ++++++++++++++++++++++++++++------------ drivers/nvme/host/nvme.h | 8 +++- drivers/nvme/host/pci.c | 6 +-- drivers/nvme/host/scsi.c | 4 +- 4 files changed, 68 insertions(+), 31 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ca54a34665ac..c6b7b17a26c8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -21,22 +21,15 @@ #include "nvme.h" -/* - * Returns 0 on success. 
If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, - u32 *result, unsigned timeout) +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags) { bool write = cmd->common.opcode & 1; - struct bio *bio = NULL; struct request *req; - int ret; - req = blk_mq_alloc_request(q, write, 0); + req = blk_mq_alloc_request(q, write, flags); if (IS_ERR(req)) - return PTR_ERR(req); + return req; req->cmd_type = REQ_TYPE_DRV_PRIV; req->cmd_flags |= REQ_FAILFAST_DRIVER; @@ -44,17 +37,65 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req->__sector = (sector_t) -1; req->bio = req->biotail = NULL; - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; - req->cmd = (unsigned char *)cmd; req->cmd_len = sizeof(struct nvme_command); req->special = (void *)0; + return req; +} + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen, u32 *result, unsigned timeout) +{ + struct request *req; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (buffer && bufflen) { ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); if (ret) goto out; - } else if (ubuffer && bufflen) { + } + + blk_execute_rq(req->q, NULL, req, 0); + if (result) + *result = (u32)(uintptr_t)req->special; + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); +} + +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout) +{ + struct bio *bio = NULL; + struct request *req; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + + if (ubuffer && bufflen) { ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, GFP_KERNEL); if (ret) @@ -73,12 +114,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, return ret; } -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen) -{ - return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); -} - int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; @@ -131,8 +166,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); } int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, @@ -146,8 +180,7 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); } int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 641741252e62..0c1dc6394e6f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -148,11 +148,15 @@ static inline int nvme_error_status(u16 status) } } +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, - u32 *result, unsigned timeout); + void *buffer, unsigned bufflen, u32 *result, unsigned timeout); +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout); int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c2d2b8a1a4de..91e013b8de23 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1697,7 +1697,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask); c.rw.metadata = cpu_to_le64(meta_dma); - status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + status = nvme_submit_user_cmd(ns->queue, &c, (void __user *)(uintptr_t)io.addr, length, NULL, 0); unmap: if (meta) { @@ -1739,8 +1739,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - status = __nvme_submit_sync_cmd(ns ? ns->queue : ctrl->admin_q, &c, - NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, &cmd.result, timeout); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index bba29553bc94..eaf725610fe2 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -1327,7 +1327,7 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - nvme_sc = __nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, hdr->dxferp, tot_len, NULL, 0); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1731,7 +1731,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_sc = NVME_SC_LBA_RANGE; break; } - nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->queue, &c, next_mapping_addr, unit_len, NULL, 0); if (nvme_sc) break; From 0b7f1f26f95a51ab11d4dc0adee230212b3cd675 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 23 Oct 2015 09:47:28 -0600 Subject: [PATCH 13/67] nvme: use the block layer for userspace passthrough metadata Use the integrity API to pass through metadata from userspace. For PI-enabled devices this means that we now validate the reftag, which seems like an unintentional omission in the old code. Thanks to Keith Busch for testing and fixes. Signed-off-by: Christoph Hellwig [Skip metadata setup on admin commands] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 83 +++++++++++++++++++++++++++++++++++----- drivers/nvme/host/nvme.h | 4 ++ drivers/nvme/host/pci.c | 39 +++---------------- 3 files changed, 83 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c6b7b17a26c8..cc28150fc963 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -81,12 +81,17 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); } -int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, - void __user *ubuffer, unsigned bufflen, u32 *result, - unsigned timeout) +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u32 *result, unsigned timeout) { - struct bio *bio = NULL; + bool write = cmd->common.opcode & 1; + struct nvme_ns *ns = q->queuedata; + struct gendisk *disk = ns ? 
ns->disk : NULL; struct request *req; + struct bio *bio = NULL; + void *meta = NULL; int ret; req = nvme_alloc_request(q, cmd, 0); @@ -101,19 +106,79 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, if (ret) goto out; bio = req->bio; - } - blk_execute_rq(req->q, NULL, req, 0); - if (bio) - blk_rq_unmap_user(bio); + if (!disk) + goto submit; + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto out_unmap; + } + + if (meta_buffer) { + struct bio_integrity_payload *bip; + + meta = kmalloc(meta_len, GFP_KERNEL); + if (!meta) { + ret = -ENOMEM; + goto out_unmap; + } + + if (write) { + if (copy_from_user(meta, meta_buffer, + meta_len)) { + ret = -EFAULT; + goto out_free_meta; + } + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (!bip) { + ret = -ENOMEM; + goto out_free_meta; + } + + bip->bip_iter.bi_size = meta_len; + bip->bip_iter.bi_sector = meta_seed; + + ret = bio_integrity_add_page(bio, virt_to_page(meta), + meta_len, offset_in_page(meta)); + if (ret != meta_len) { + ret = -ENOMEM; + goto out_free_meta; + } + } + } + submit: + blk_execute_rq(req->q, disk, req, 0); + ret = req->errors; if (result) *result = (u32)(uintptr_t)req->special; - ret = req->errors; + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + out_free_meta: + kfree(meta); + out_unmap: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } out: blk_mq_free_request(req); return ret; } +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout) +{ + return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, + result, timeout); +} + int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0c1dc6394e6f..5ba9acbfddfa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -157,6 +157,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout); +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u32 *result, unsigned timeout); int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 91e013b8de23..aa033f047aaf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1635,13 +1635,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { - struct nvme_dev *dev = to_nvme_dev(ns->ctrl); struct nvme_user_io io; struct nvme_command c; unsigned length, meta_len; - int status, write; - dma_addr_t meta_dma = 0; - void *meta = NULL; void __user *metadata; if (copy_from_user(&io, uio, sizeof(io))) @@ -1659,29 +1655,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) length = (io.nblocks + 1) << ns->lba_shift; meta_len = (io.nblocks + 1) * ns->ms; metadata = (void __user *)(uintptr_t)io.metadata; - write = io.opcode & 1; if (ns->ext) { length += meta_len; 
meta_len = 0; - } - if (meta_len) { - if (((io.metadata & 3) || !io.metadata) && !ns->ext) + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) return -EINVAL; - - meta = dma_alloc_coherent(dev->dev, meta_len, - &meta_dma, GFP_KERNEL); - - if (!meta) { - status = -ENOMEM; - goto unmap; - } - if (write) { - if (copy_from_user(meta, metadata, meta_len)) { - status = -EFAULT; - goto unmap; - } - } } memset(&c, 0, sizeof(c)); @@ -1695,19 +1675,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - c.rw.metadata = cpu_to_le64(meta_dma); - status = nvme_submit_user_cmd(ns->queue, &c, - (void __user *)(uintptr_t)io.addr, length, NULL, 0); - unmap: - if (meta) { - if (status == NVME_SC_SUCCESS && !write) { - if (copy_to_user(metadata, meta, meta_len)) - status = -EFAULT; - } - dma_free_coherent(dev->dev, meta_len, meta, meta_dma); - } - return status; + return __nvme_submit_user_cmd(ns->queue, &c, + (void __user *)(uintptr_t)io.addr, length, + metadata, meta_len, io.slba, NULL, 0); } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, From 1673f1f08c8876f3942b4fa5e8f6a40215f15a94 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 10:54:19 +0100 Subject: [PATCH 14/67] nvme: move block_device_operations and ns/ctrl freeing to common code This moves the block_device_operations over to common code mostly as-is. The only change is that the ns and ctrl refcounting got some small refactoring to have wrappers around the kref_put operations. A new free_ctrl operation is added to allow the PCI driver to free its resources on the final drop. Signed-off-by: Christoph Hellwig [Moved the integrity and pr changes due to merge conflict] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 413 +++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 14 ++ drivers/nvme/host/pci.c | 412 ++------------------------------------ 3 files changed, 439 insertions(+), 400 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cc28150fc963..63ec86a93b83 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -15,12 +15,55 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include +#include +#include #include "nvme.h" +DEFINE_SPINLOCK(dev_list_lock); + +static void nvme_free_ns(struct kref *kref) +{ + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + + if (ns->type == NVME_NS_LIGHTNVM) + nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + nvme_put_ctrl(ns->ctrl); + put_disk(ns->disk); + kfree(ns); +} + +void nvme_put_ns(struct nvme_ns *ns) +{ + kref_put(&ns->kref, nvme_free_ns); +} + +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) +{ + struct nvme_ns *ns; + + spin_lock(&dev_list_lock); + ns = disk->private_data; + if (ns && !kref_get_unless_zero(&ns->kref)) + ns = NULL; + spin_unlock(&dev_list_lock); + + return ns; +} + struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags) { @@ -269,3 +312,373 @@ int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) kfree(*log); return error; } + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + 
struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(uintptr_t)io.metadata; + + if (ns->ext) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return __nvme_submit_user_cmd(ns->queue, &c, + (void __user *)(uintptr_t)io.addr, length, + metadata, meta_len, io.slba, NULL, 0); +} + +int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)cmd.addr, cmd.data_len, + &cmd.result, timeout); + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SG_IO: + return -ENOIOCTLCMD; + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_get_ns_from_disk(bdev->bd_disk) ? 
0 : -ENXIO; +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns(disk->private_data); +} + +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + return 0; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity.profile = &t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity.profile = &t10_pi_type1_crc; + break; + default: + integrity.profile = NULL; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} +#else +static void nvme_init_integrity(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + blk_queue_max_discard_sectors(ns->queue, 0xffffffff); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; + + if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { + dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", + __func__, ns->ctrl->instance, ns->ns_id); + return -ENODEV; + } + if (id->ncap == 0) { + kfree(id); + return -ENODEV; + } + + if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { + if (nvme_nvm_register(ns->queue, disk->disk_name)) { + dev_warn(ns->ctrl->dev, + "%s: LightNVM init failure\n", __func__); + kfree(id); + return -ENODEV; + } + ns->type = NVME_NS_LIGHTNVM; + } + + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 
+ id->dps & NVME_NS_DPS_PI_MASK : 0; + + blk_mq_freeze_queue(disk->queue); + if (blk_get_integrity(disk) && (ns->pi_type != pi_type || + ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !ns->ext) + nvme_init_integrity(ns); + + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + blk_mq_unfreeze_queue(disk->queue); + + kfree(id); + return 0; +} + +static char nvme_pr_type(enum pr_type type) +{ + switch (type) { + case PR_WRITE_EXCLUSIVE: + return 1; + case PR_EXCLUSIVE_ACCESS: + return 2; + case PR_WRITE_EXCLUSIVE_REG_ONLY: + return 3; + case PR_EXCLUSIVE_ACCESS_REG_ONLY: + return 4; + case PR_WRITE_EXCLUSIVE_ALL_REGS: + return 5; + case PR_EXCLUSIVE_ACCESS_ALL_REGS: + return 6; + default: + return 0; + } +}; + +static int nvme_pr_command(struct block_device *bdev, u32 cdw10, + u64 key, u64 sa_key, u8 op) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_command c; + u8 data[16] = { 0, }; + + put_unaligned_le64(key, &data[0]); + put_unaligned_le64(sa_key, &data[8]); + + memset(&c, 0, sizeof(c)); + c.common.opcode = op; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw10[0] = cpu_to_le32(cdw10); + + return nvme_submit_sync_cmd(ns->queue, &c, data, 16); +} + +static int nvme_pr_register(struct block_device *bdev, u64 old, + u64 new, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = old ? 2 : 0; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); +} + +static int nvme_pr_reserve(struct block_device *bdev, u64 key, + enum pr_type type, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = nvme_pr_type(type) << 8; + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); +} + +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, + enum pr_type type, bool abort) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); +} + +static int nvme_pr_clear(struct block_device *bdev, u64 key) +{ + u32 cdw10 = 1 | key ? 1 << 3 : 0; + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); +} + +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | key ? 
1 << 3 : 0; + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); +} + +static const struct pr_ops nvme_pr_ops = { + .pr_register = nvme_pr_register, + .pr_reserve = nvme_pr_reserve, + .pr_release = nvme_pr_release, + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, +}; + +const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, + .pr_ops = &nvme_pr_ops, +}; + +static void nvme_free_ctrl(struct kref *kref) +{ + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + + ctrl->ops->free_ctrl(ctrl); +} + +void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + kref_put(&ctrl->kref, nvme_free_ctrl); +} + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5ba9acbfddfa..3b3f855580ee 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,6 +19,8 @@ #include #include +struct nvme_passthru_cmd; + extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) @@ -34,6 +36,7 @@ struct nvme_ctrl { const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; struct device *dev; + struct kref kref; int instance; char name[12]; @@ -69,6 +72,7 @@ struct nvme_ns { struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + void (*free_ctrl)(struct nvme_ctrl *ctrl); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -148,6 +152,9 @@ static inline int nvme_error_status(u16 status) } } +void nvme_put_ctrl(struct nvme_ctrl *ctrl); +void nvme_put_ns(struct nvme_ns *ns); + struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, @@ -170,6 +177,13 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); +extern const struct block_device_operations nvme_fops; +extern spinlock_t dev_list_lock; + +int nvme_revalidate_disk(struct gendisk *disk); +int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd); + struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index aa033f047aaf..e0f40afbf01b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -79,7 +79,6 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; @@ -127,7 +126,6 @@ struct nvme_dev { struct msix_entry *entry; void __iomem *bar; struct list_head namespaces; - struct kref kref; struct device *device; struct work_struct reset_work; struct work_struct probe_work; @@ -601,27 +599,6 @@ static void nvme_dif_remap(struct request *req, } kunmap_atomic(pmap); } - -static void nvme_init_integrity(struct nvme_ns *ns) -{ - struct blk_integrity integrity; - - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - integrity.profile = &t10_pi_type3_crc; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - integrity.profile = &t10_pi_type1_crc; - break; - default: - 
integrity.profile = NULL; - break; - } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); -} #else /* CONFIG_BLK_DEV_INTEGRITY */ static void nvme_dif_remap(struct request *req, void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) @@ -633,9 +610,6 @@ static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) { } -static void nvme_init_integrity(struct nvme_ns *ns) -{ -} #endif static void req_completion(struct nvme_queue *nvmeq, void *ctx, @@ -1633,94 +1607,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void __user *)(uintptr_t)io.metadata; - - if (ns->ext) { - length += meta_len; - meta_len = 0; - } else if (meta_len) { - if ((io.metadata & 3) || !io.metadata) - return -EINVAL; - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - - return __nvme_submit_user_cmd(ns->queue, &c, - (void __user *)(uintptr_t)io.addr, length, - metadata, meta_len, io.slba, NULL, 0); -} - -static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - &cmd.result, timeout); - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - static int nvme_subsys_reset(struct nvme_dev *dev) { if (!dev->subsystem) @@ -1730,281 +1616,6 @@ static int nvme_subsys_reset(struct nvme_dev *dev) return 0; } -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - -static void nvme_free_dev(struct kref *kref); -static void nvme_free_ns(struct kref *kref) -{ - struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - struct nvme_dev *dev = to_nvme_dev(ns->ctrl); - - if (ns->type == NVME_NS_LIGHTNVM) - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); - - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - - kref_put(&dev->kref, nvme_free_dev); - put_disk(ns->disk); - kfree(ns); -} - -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ - int ret = 0; - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = bdev->bd_disk->private_data; - if (!ns) - ret = -ENXIO; - else if (!kref_get_unless_zero(&ns->kref)) - ret = -ENXIO; - spin_unlock(&dev_list_lock); - - return ret; -} - -static void nvme_release(struct gendisk *disk, fmode_t mode) -{ - struct nvme_ns *ns = disk->private_data; - kref_put(&ns->kref, nvme_free_ns); -} - -static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) -{ - /* some standard values */ - geo->heads = 1 << 6; - geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; - return 0; -} - -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - blk_queue_max_discard_sectors(ns->queue, 0xffffffff); - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static int nvme_revalidate_disk(struct gendisk *disk) -{ - struct nvme_ns *ns = disk->private_data; - struct nvme_dev *dev = to_nvme_dev(ns->ctrl); - struct nvme_id_ns *id; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - if (nvme_identify_ns(&dev->ctrl, ns->ns_id, &id)) { - dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, - dev->ctrl.instance, ns->ns_id); - return -ENODEV; - } - if (id->ncap == 0) { - kfree(id); - return -ENODEV; - } - - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { - if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(dev->dev, - "%s: LightNVM init 
failure\n", __func__); - kfree(id); - return -ENODEV; - } - ns->type = NVME_NS_LIGHTNVM; - } - - old_ms = ns->ms; - lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); - - /* - * If identify namespace failed, use default 512 byte block size so - * block layer can use before failing read/write for 0 capacity. - */ - if (ns->lba_shift == 0) - ns->lba_shift = 9; - bs = 1 << ns->lba_shift; - - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ - pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? - id->dps & NVME_NS_DPS_PI_MASK : 0; - - blk_mq_freeze_queue(disk->queue); - if (blk_get_integrity(disk) && (ns->pi_type != pi_type || - ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - - ns->pi_type = pi_type; - blk_queue_logical_block_size(ns->queue, bs); - - if (ns->ms && !ns->ext) - nvme_init_integrity(ns); - - if ((ns->ms && !(ns->ms == 8 && ns->pi_type) && - !blk_get_integrity(disk)) || - ns->type == NVME_NS_LIGHTNVM) - set_capacity(disk, 0); - else - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->ctrl.oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - blk_mq_unfreeze_queue(disk->queue); - - kfree(id); - return 0; -} - -static char nvme_pr_type(enum pr_type type) -{ - switch (type) { - case PR_WRITE_EXCLUSIVE: - return 1; - case PR_EXCLUSIVE_ACCESS: - return 2; - case PR_WRITE_EXCLUSIVE_REG_ONLY: - return 3; - case PR_EXCLUSIVE_ACCESS_REG_ONLY: - return 4; - case PR_WRITE_EXCLUSIVE_ALL_REGS: - return 5; - case PR_EXCLUSIVE_ACCESS_ALL_REGS: - return 6; - default: - return 0; - } -}; - -static int nvme_pr_command(struct block_device *bdev, u32 cdw10, - u64 key, u64 sa_key, u8 op) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_command c; - u8 data[16] = { 0, }; - - put_unaligned_le64(key, &data[0]); - put_unaligned_le64(sa_key, &data[8]); - - memset(&c, 0, sizeof(c)); - c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->ns_id); - c.common.cdw10[0] = cpu_to_le32(cdw10); - - return nvme_submit_sync_cmd(ns->queue, &c, data, 16); -} - -static int nvme_pr_register(struct block_device *bdev, u64 old, - u64 new, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = old ? 2 : 0; - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); -} - -static int nvme_pr_reserve(struct block_device *bdev, u64 key, - enum pr_type type, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = nvme_pr_type(type) << 8; - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); -} - -static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, - enum pr_type type, bool abort) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); -} - -static int nvme_pr_clear(struct block_device *bdev, u64 key) -{ - u32 cdw10 = 1 | (key ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); -} - -static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | key ? 
1 << 3 : 0; - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); -} - -static const struct pr_ops nvme_pr_ops = { - .pr_register = nvme_pr_register, - .pr_reserve = nvme_pr_reserve, - .pr_release = nvme_pr_release, - .pr_preempt = nvme_pr_preempt, - .pr_clear = nvme_pr_clear, -}; - -static const struct block_device_operations nvme_fops = { - .owner = THIS_MODULE, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .open = nvme_open, - .release = nvme_release, - .getgeo = nvme_getgeo, - .revalidate_disk= nvme_revalidate_disk, - .pr_ops = &nvme_pr_ops, -}; - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -2105,7 +1716,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) if (nvme_revalidate_disk(ns->disk)) goto out_free_disk; - kref_get(&dev->kref); + kref_get(&dev->ctrl.kref); if (ns->type != NVME_NS_LIGHTNVM) { add_disk(ns->disk); if (ns->ms) { @@ -2354,7 +1965,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_cleanup_queue(ns->queue); } list_del_init(&ns->list); - kref_put(&ns->kref, nvme_free_ns); + nvme_put_ns(ns); } static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) @@ -2828,9 +2439,9 @@ static void nvme_release_instance(struct nvme_dev *dev) spin_unlock(&dev_list_lock); } -static void nvme_free_dev(struct kref *kref) +static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { - struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + struct nvme_dev *dev = to_nvme_dev(ctrl); put_device(dev->dev); put_device(dev->device); @@ -2857,7 +2468,7 @@ static int nvme_dev_open(struct inode *inode, struct file *f) ret = -EWOULDBLOCK; break; } - if (!kref_get_unless_zero(&dev->kref)) + if (!kref_get_unless_zero(&dev->ctrl.kref)) break; f->private_data = dev; ret = 0; @@ -2872,7 +2483,7 @@ static int nvme_dev_open(struct inode *inode, struct file *f) static int nvme_dev_release(struct inode *inode, struct file *f) { struct nvme_dev *dev = f->private_data; - kref_put(&dev->kref, nvme_free_dev); + nvme_put_ctrl(&dev->ctrl); return 0; } @@ -2987,19 +2598,19 @@ static int nvme_remove_dead_ctrl(void *arg) if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); - kref_put(&dev->kref, nvme_free_dev); + nvme_put_ctrl(&dev->ctrl); return 0; } static void nvme_dead_ctrl(struct nvme_dev *dev) { dev_warn(dev->dev, "Device failed to resume\n"); - kref_get(&dev->kref); + kref_get(&dev->ctrl.kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", dev->ctrl.instance))) { dev_err(dev->dev, "Failed to start controller remove task\n"); - kref_put(&dev->kref, nvme_free_dev); + nvme_put_ctrl(&dev->ctrl); } } @@ -3077,6 +2688,7 @@ static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, + .free_ctrl = nvme_pci_free_ctrl, }; static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -3116,7 +2728,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release; - kref_init(&dev->kref); + kref_init(&dev->ctrl.kref); dev->device = device_create(nvme_class, &pdev->dev, MKDEV(nvme_char_major, dev->ctrl.instance), dev, "nvme%d", dev->ctrl.instance); @@ -3189,7 +2801,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); - kref_put(&dev->kref, nvme_free_dev); + nvme_put_ctrl(&dev->ctrl); } /* These functions are yet to be implemented */ From 
106198edb74cdf3fe1aefa6ad1e199b58ab7c4cb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 26 Nov 2015 10:07:41 +0100
Subject: [PATCH 15/67] nvme: add explicit quirk handling

Add an enum for all workarounds not in the spec and identify the
affected controllers at probe time.

Signed-off-by: Christoph Hellwig
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/nvme.h | 13 +++++++++++++
 drivers/nvme/host/pci.c  |  8 +++++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 3b3f855580ee..f7f16e32104f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -32,6 +32,18 @@ enum {
 	NVME_NS_LIGHTNVM = 1,
 };

+/*
+ * List of workarounds for devices that required behavior not specified in
+ * the standard.
+ */
+enum nvme_quirks {
+	/*
+	 * Prefers I/O aligned to a stripe size specified in a vendor
+	 * specific Identify field.
+	 */
+	NVME_QUIRK_STRIPE_SIZE = (1 << 0),
+};
+
 struct nvme_ctrl {
 	const struct nvme_ctrl_ops *ops;
 	struct request_queue *admin_q;
@@ -47,6 +59,7 @@ struct nvme_ctrl {
 	u16 abort_limit;
 	u8 event_limit;
 	u8 vwc;
+	unsigned long quirks;
 };

 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e0f40afbf01b..27d74490ff87 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2026,7 +2026,6 @@ static void nvme_dev_scan(struct work_struct *work)
  */
 static int nvme_dev_add(struct nvme_dev *dev)
 {
-	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int res;
 	struct nvme_id_ctrl *ctrl;
 	int shift = NVME_CAP_MPSMIN(lo_hi_readq(dev->bar + NVME_REG_CAP)) + 12;
@@ -2047,8 +2046,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
 	else
 		dev->max_hw_sectors = UINT_MAX;
-	if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
-	    (pdev->device == 0x0953) && ctrl->vs[3]) {
+
+	if ((dev->ctrl.quirks & NVME_QUIRK_STRIPE_SIZE) && ctrl->vs[3]) {
 		unsigned int max_hw_sectors;

 		dev->stripe_size = 1 << (ctrl->vs[3] + shift);
@@ -2719,6 +2718,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)

 	dev->ctrl.ops = &nvme_pci_ctrl_ops;
 	dev->ctrl.dev = dev->dev;
+	dev->ctrl.quirks = id->driver_data;

 	result = nvme_set_instance(dev);
 	if (result)
@@ -2846,6 +2846,8 @@ static const struct pci_error_handlers nvme_err_handler = {
 #define PCI_CLASS_STORAGE_EXPRESS	0x010802

 static const struct pci_device_id nvme_id_table[] = {
+	{ PCI_VDEVICE(INTEL, 0x0953),
+		.driver_data = NVME_QUIRK_STRIPE_SIZE, },
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
 	{ 0, }

From 1b2eb374651f0496b86ed5f095d4c448bff214fa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 28 Nov 2015 15:01:09 +0100
Subject: [PATCH 16/67] nvme: move remaining CC setup into nvme_enable_ctrl

Move the calculation of all the bits written into the CC register into
nvme_enable_ctrl, so that they can be moved into the core NVMe driver
in the future.
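To make the consolidated setup easier to follow: the CC (Controller
Configuration) value that nvme_enable_ctrl assembles in the diff below
is composed field by field. The following is an editorial sketch, not
part of the patch; it uses the NVMe 1.x register field offsets directly
instead of the driver's NVME_CC_* constants, and build_cc() is a
hypothetical name.

#include <stdint.h>

/* Compose a CC register value the way nvme_enable_ctrl() does. */
static uint32_t build_cc(unsigned page_shift)
{
	uint32_t cc = 0;

	cc |= 0u << 4;			/* CSS = 0: NVM command set */
	cc |= (page_shift - 12) << 7;	/* MPS: memory page = 2^(12+MPS) */
	cc |= 0u << 11;			/* AMS = 0: round-robin arbitration */
	cc |= 0u << 14;			/* SHN = 0: no shutdown notification */
	cc |= 6u << 16;			/* IOSQES: 2^6 = 64-byte SQ entries */
	cc |= 4u << 20;			/* IOCQES: 2^4 = 16-byte CQ entries */
	cc |= 1u << 0;			/* EN: enable the controller */

	return cc;
}

For the default 4K page size (page_shift = 12) this yields 0x00460001,
which is the value the rewritten nvme_enable_ctrl below writes to the
CC register.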
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 44 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 27d74490ff87..1ae94cd74702 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1446,8 +1446,28 @@ static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) { - dev->ctrl_config &= ~NVME_CC_SHN_MASK; + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. + */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + + if (page_shift < dev_page_min) { + dev_err(dev->dev, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + dev->page_size = 1 << page_shift; + + dev->ctrl_config = NVME_CC_CSS_NVM; + dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; dev->ctrl_config |= NVME_CC_ENABLE; + writel(dev->ctrl_config, dev->bar + NVME_REG_CC); return nvme_wait_ready(dev, cap, true); @@ -1541,21 +1561,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u32 aqa; u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; - /* - * default to a 4K page size, with the intention to update this - * path in the future to accomodate architectures with differing - * kernel and IO page sizes. - */ - unsigned page_shift = 12; - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; - - if (page_shift < dev_page_min) { - dev_err(dev->dev, - "Minimum device page size (%u) too large for " - "host (%u)\n", 1 << dev_page_min, - 1 << page_shift); - return -ENODEV; - } dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? 
NVME_CAP_NSSRC(cap) : 0; @@ -1578,13 +1583,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; - dev->page_size = 1 << page_shift; - - dev->ctrl_config = NVME_CC_CSS_NVM; - dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - writel(aqa, dev->bar + NVME_REG_AQA); lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); From 5fd4ce1b005bd6ede913763f65efae9af6f7f386 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 28 Nov 2015 15:03:49 +0100 Subject: [PATCH 17/67] nvme: move nvme_{enable,disable,shutdown}_ctrl to common code Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 106 +++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 11 ++++ drivers/nvme/host/pci.c | 133 +++++++-------------------------------- 3 files changed, 141 insertions(+), 109 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 63ec86a93b83..e3179b33ff81 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -670,6 +671,111 @@ const struct block_device_operations nvme_fops = { .pr_ops = &nvme_pr_ops, }; +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_RDY) == bit) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return ret; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, false); +} + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. 
+ */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + int ret; + + if (page_shift < dev_page_min) { + dev_err(ctrl->dev, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + ctrl->page_size = 1 << page_shift; + + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, true); +} + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return ret; +} + static void nvme_free_ctrl(struct kref *kref) { struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f7f16e32104f..b6c5a55ed59f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,6 +27,9 @@ extern unsigned char nvme_io_timeout; extern unsigned char admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) +extern unsigned char shutdown_timeout; +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, @@ -55,6 +58,10 @@ struct nvme_ctrl { char serial[20]; char model[40]; char firmware_rev[8]; + + u32 ctrl_config; + + u32 page_size; u16 oncs; u16 abort_limit; u8 event_limit; @@ -85,6 +92,7 @@ struct nvme_ns { struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); void (*free_ctrl)(struct nvme_ctrl *ctrl); }; @@ -165,6 +173,9 @@ static inline int nvme_error_status(u16 status) } } +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ns(struct nvme_ns *ns); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1ae94cd74702..ccb315101a5e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -52,7 +52,6 @@ #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); @@ -62,7 +61,7 @@ unsigned char nvme_io_timeout = 30; module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); -static unsigned char shutdown_timeout = 5; +unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); @@ -122,7 +121,6 @@ struct nvme_dev { 
unsigned max_qid; int q_depth; u32 db_stride; - u32 ctrl_config; struct msix_entry *entry; void __iomem *bar; struct list_head namespaces; @@ -133,7 +131,6 @@ struct nvme_dev { bool subsystem; u32 max_hw_sectors; u32 stripe_size; - u32 page_size; void __iomem *cmb; dma_addr_t cmb_dma_addr; u64 cmb_size; @@ -225,7 +222,7 @@ struct nvme_cmd_info { * Max size of iod being embedded in the request payload */ #define NVME_INT_PAGES 2 -#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) #define NVME_INT_MASK 0x01 /* @@ -235,7 +232,8 @@ struct nvme_cmd_info { */ static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, + dev->ctrl.page_size); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } @@ -527,7 +525,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { - const int last_prp = dev->page_size / 8 - 1; + const int last_prp = dev->ctrl.page_size / 8 - 1; int i; __le64 **list = iod_list(iod); dma_addr_t prp_dma = iod->first_dma; @@ -668,7 +666,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->page_size; + u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; __le64 **list = iod_list(iod); @@ -1275,11 +1273,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, int entry_size) { int q_depth = dev->q_depth; - unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); + unsigned q_size_aligned = roundup(q_depth * entry_size, + dev->ctrl.page_size); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->page_size); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1298,8 +1297,8 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { - unsigned offset = (qid - 1) * - roundup(SQ_SIZE(depth), dev->page_size); + unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), + dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; nvmeq->sq_cmds_io = dev->cmb + offset; } else { @@ -1407,97 +1406,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) return result; } -static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) -{ - unsigned long timeout; - u32 bit = enabled ? NVME_CSTS_RDY : 0; - - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - - while ((readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_RDY) != bit) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device not ready; aborting %s\n", enabled ? - "initialisation" : "reset"); - return -ENODEV; - } - } - - return 0; -} - -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! 
- */ -static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config &= ~NVME_CC_ENABLE; - writel(dev->ctrl_config, dev->bar + NVME_REG_CC); - - return nvme_wait_ready(dev, cap, false); -} - -static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) -{ - /* - * Default to a 4K page size, with the intention to update this - * path in the future to accomodate architectures with differing - * kernel and IO page sizes. - */ - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; - - if (page_shift < dev_page_min) { - dev_err(dev->dev, - "Minimum device page size %u too large for host (%u)\n", - 1 << dev_page_min, 1 << page_shift); - return -ENODEV; - } - - dev->page_size = 1 << page_shift; - - dev->ctrl_config = NVME_CC_CSS_NVM; - dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - dev->ctrl_config |= NVME_CC_ENABLE; - - writel(dev->ctrl_config, dev->bar + NVME_REG_CC); - - return nvme_wait_ready(dev, cap, true); -} - -static int nvme_shutdown_ctrl(struct nvme_dev *dev) -{ - unsigned long timeout; - - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_SHN_NORMAL; - - writel(dev->ctrl_config, dev->bar + NVME_REG_CC); - - timeout = SHUTDOWN_TIMEOUT + jiffies; - while ((readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_SHST_MASK) != - NVME_CSTS_SHST_CMPLT) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return 0; -} - static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, @@ -1569,7 +1477,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(dev, cap); + result = nvme_disable_ctrl(&dev->ctrl, cap); if (result < 0) return result; @@ -1587,7 +1495,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(dev, cap); + result = nvme_enable_ctrl(&dev->ctrl, cap); if (result) goto free_nvmeq; @@ -1687,13 +1595,13 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) if (dev->max_hw_sectors) { blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); blk_queue_max_segments(ns->queue, - (dev->max_hw_sectors / (dev->page_size >> 9)) + 1); + (dev->max_hw_sectors / (dev->ctrl.page_size >> 9)) + 1); } if (dev->stripe_size) blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); if (dev->ctrl.vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, dev->page_size - 1); + blk_queue_virt_boundary(ns->queue, dev->ctrl.page_size - 1); disk->major = nvme_major; disk->first_minor = 0; @@ -2181,7 +2089,7 @@ static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) * queues than admin tags. 
*/ set_current_state(TASK_RUNNING); - nvme_disable_ctrl(dev, + nvme_disable_ctrl(&dev->ctrl, lo_hi_readq(dev->bar + NVME_REG_CAP)); nvme_clear_queue(dev->queues[0]); flush_kthread_worker(dq->worker); @@ -2367,7 +2275,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) } } else { nvme_disable_io_queues(dev); - nvme_shutdown_ctrl(dev); + nvme_shutdown_ctrl(&dev->ctrl); nvme_disable_queue(dev, 0); } nvme_dev_unmap(dev); @@ -2683,8 +2591,15 @@ static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) return 0; } +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) +{ + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} + static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, .free_ctrl = nvme_pci_free_ctrl, }; From 7fd8930f26be4c9078684b2fef14da0503771bf2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 28 Nov 2015 15:37:52 +0100 Subject: [PATCH 18/67] nvme: add a common helper to read Identify Controller data And add the 64-bit register read operation for it. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 52 +++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 4 +++ drivers/nvme/host/pci.c | 53 ++++++++++++---------------------------- 3 files changed, 71 insertions(+), 38 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e3179b33ff81..1c9f09c80b9d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -776,6 +776,58 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) return ret; } +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. 
+ */ +int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u64 cap; + int ret, page_shift; + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + if (ret) { + dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); + return ret; + } + page_shift = NVME_CAP_MPSMIN(cap) + 12; + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + ctrl->oncs = le16_to_cpup(&id->oncs); + ctrl->abort_limit = id->acl + 1; + ctrl->vwc = id->vwc; + memcpy(ctrl->serial, id->sn, sizeof(id->sn)); + memcpy(ctrl->model, id->mn, sizeof(id->mn)); + memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); + if (id->mdts) + ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9); + else + ctrl->max_hw_sectors = UINT_MAX; + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { + unsigned int max_hw_sectors; + + ctrl->stripe_size = 1 << (id->vs[3] + page_shift); + max_hw_sectors = ctrl->stripe_size >> (page_shift - 9); + if (ctrl->max_hw_sectors) { + ctrl->max_hw_sectors = min(max_hw_sectors, + ctrl->max_hw_sectors); + } else { + ctrl->max_hw_sectors = max_hw_sectors; + } + } + + kfree(id); + return 0; +} + static void nvme_free_ctrl(struct kref *kref) { struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b6c5a55ed59f..a624add7ca22 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -62,6 +62,8 @@ struct nvme_ctrl { u32 ctrl_config; u32 page_size; + u32 max_hw_sectors; + u32 stripe_size; u16 oncs; u16 abort_limit; u8 event_limit; @@ -93,6 +95,7 @@ struct nvme_ns { struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); }; @@ -177,6 +180,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_put_ns(struct nvme_ns *ns); struct request *nvme_alloc_request(struct request_queue *q, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ccb315101a5e..086563fe6ed1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -129,8 +129,6 @@ struct nvme_dev { struct work_struct probe_work; struct work_struct scan_work; bool subsystem; - u32 max_hw_sectors; - u32 stripe_size; void __iomem *cmb; dma_addr_t cmb_dma_addr; u64 cmb_size; @@ -1592,13 +1590,13 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) list_add_tail(&ns->list, &dev->namespaces); blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->ctrl.max_hw_sectors) { + blk_queue_max_hw_sectors(ns->queue, dev->ctrl.max_hw_sectors); blk_queue_max_segments(ns->queue, - (dev->max_hw_sectors / (dev->ctrl.page_size >> 9)) + 1); + (dev->ctrl.max_hw_sectors / (dev->ctrl.page_size >> 9)) + 1); } - if (dev->stripe_size) - blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); + if (dev->ctrl.stripe_size) + blk_queue_chunk_sectors(ns->queue, dev->ctrl.stripe_size >> 9); if (dev->ctrl.vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); blk_queue_virt_boundary(ns->queue, dev->ctrl.page_size - 
1); @@ -1933,38 +1931,10 @@ static void nvme_dev_scan(struct work_struct *work) static int nvme_dev_add(struct nvme_dev *dev) { int res; - struct nvme_id_ctrl *ctrl; - int shift = NVME_CAP_MPSMIN(lo_hi_readq(dev->bar + NVME_REG_CAP)) + 12; - res = nvme_identify_ctrl(&dev->ctrl, &ctrl); - if (res) { - dev_err(dev->dev, "Identify Controller failed (%d)\n", res); - return -EIO; - } - - dev->ctrl.oncs = le16_to_cpup(&ctrl->oncs); - dev->ctrl.abort_limit = ctrl->acl + 1; - dev->ctrl.vwc = ctrl->vwc; - memcpy(dev->ctrl.serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->ctrl.model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->ctrl.firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) - dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - else - dev->max_hw_sectors = UINT_MAX; - - if ((dev->ctrl.quirks & NVME_QUIRK_STRIPE_SIZE) && ctrl->vs[3]) { - unsigned int max_hw_sectors; - - dev->stripe_size = 1 << (ctrl->vs[3] + shift); - max_hw_sectors = dev->stripe_size >> (shift - 9); - if (dev->max_hw_sectors) { - dev->max_hw_sectors = min(max_hw_sectors, - dev->max_hw_sectors); - } else - dev->max_hw_sectors = max_hw_sectors; - } - kfree(ctrl); + res = nvme_init_identify(&dev->ctrl); + if (res) + return res; if (!dev->tagset.tags) { dev->tagset.ops = &nvme_mq_ops; @@ -2597,9 +2567,16 @@ static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) return 0; } +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = readq(to_nvme_dev(ctrl)->bar + off); + return 0; +} + static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, .free_ctrl = nvme_pci_free_ctrl, }; From ce4541f40a949cd9a9c9f308b1a6a86914ce6e1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2015 07:58:46 +0200 Subject: [PATCH 19/67] nvme: move the call to nvme_init_identify earlier We want to record the identify and CAP values even if no I/O queue is available. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 086563fe6ed1..4d64aee61aea 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1930,12 +1930,6 @@ static void nvme_dev_scan(struct work_struct *work) */ static int nvme_dev_add(struct nvme_dev *dev) { - int res; - - res = nvme_init_identify(&dev->ctrl); - if (res) - return res; - if (!dev->tagset.tags) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2431,6 +2425,10 @@ static void nvme_probe_work(struct work_struct *work) if (result) goto disable; + result = nvme_init_identify(&dev->ctrl); + if (result) + goto free_tags; + result = nvme_setup_io_queues(dev); if (result) goto free_tags; From 5bae7f73d378a986671a3cad717c721b38f80d9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 28 Nov 2015 15:39:07 +0100 Subject: [PATCH 20/67] nvme: move namespace scanning to common code The namespace scanning code has been mostly generic already, we just need to store a pointer to the tagset in the nvme_ctrl structure, and add a method to check if a controller is I/O incapable. The latter will hopefully be replaced by a proper controller state machine soon. 
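The scan loop this patch moves into core.c follows a simple reconcile
pattern that can be modeled outside the kernel. The sketch below is an
editorial illustration only: plain user-space C with a bitmap standing
in for the kernel's namespace list, and all names made up. Only the
control flow mirrors __nvme_scan_namespaces further down.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NSID 8

static bool on_ctrl[MAX_NSID + 1];	/* NSIDs the controller reports */
static bool on_host[MAX_NSID + 1];	/* NSIDs the host has disks for */

/* Reconcile host state with the controller; nn is Identify's namespace count. */
static void rescan(unsigned nn)
{
	unsigned i;

	for (i = 1; i <= nn; i++) {
		if (on_host[i] && !on_ctrl[i])
			on_host[i] = false;	/* revalidate failed: remove */
		else if (!on_host[i] && on_ctrl[i])
			on_host[i] = true;	/* newly attached: allocate */
	}
	for (i = nn + 1; i <= MAX_NSID; i++)
		on_host[i] = false;		/* beyond reported count: stale */
}

int main(void)
{
	unsigned i;

	on_ctrl[1] = on_ctrl[3] = true;		/* boot: namespaces 1 and 3 */
	rescan(3);
	on_ctrl[2] = true;			/* namespace 2 attached later */
	rescan(3);
	for (i = 1; i <= MAX_NSID; i++)
		if (on_host[i])
			printf("nvme0n%u\n", i);	/* prints n1, n2, n3 */
	return 0;
}

The kernel version additionally ends with list_sort(): nvme_alloc_ns
appends new namespaces at the tail, so a namespace attached between
existing IDs (like NSID 2 above) would otherwise leave ctrl->namespaces
out of order and break the early-exit lookup in nvme_find_ns.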
Signed-off-by: Christoph Hellwig [Fixed pr conflicts] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 192 +++++++++++++++++++++++++++++++++- drivers/nvme/host/nvme.h | 24 ++++- drivers/nvme/host/pci.c | 220 ++++++--------------------------------- 3 files changed, 240 insertions(+), 196 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1c9f09c80b9d..1b8498434b49 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -29,6 +31,9 @@ #include "nvme.h" +static int nvme_major; +module_param(nvme_major, int, 0); + DEFINE_SPINLOCK(dev_list_lock); static void nvme_free_ns(struct kref *kref) @@ -47,7 +52,7 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -void nvme_put_ns(struct nvme_ns *ns) +static void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); } @@ -496,7 +501,7 @@ static void nvme_config_discard(struct nvme_ns *ns) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } -int nvme_revalidate_disk(struct gendisk *disk) +static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_id_ns *id; @@ -660,7 +665,7 @@ static const struct pr_ops nvme_pr_ops = { .pr_clear = nvme_pr_clear, }; -const struct block_device_operations nvme_fops = { +static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, @@ -840,3 +845,184 @@ void nvme_put_ctrl(struct nvme_ctrl *ctrl) kref_put(&ctrl->kref, nvme_free_ctrl); } +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->ns_id - nsb->ns_id; +} + +static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->ns_id == nsid) + return ns; + if (ns->ns_id > nsid) + break; + } + return NULL; +} + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int node = dev_to_node(ctrl->dev); + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) + return; + + ns->queue = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ns->queue)) + goto out_free_ns; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); + ns->queue->queuedata = ns; + ns->ctrl = ctrl; + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_queue; + + kref_init(&ns->kref); + ns->ns_id = nsid; + ns->disk = disk; + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + list_add_tail(&ns->list, &ctrl->namespaces); + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (ctrl->max_hw_sectors) { + blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); + blk_queue_max_segments(ns->queue, + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); + blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); + + disk->major = nvme_major; + disk->first_minor = 0; + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->driverfs_dev = 
ctrl->device;
+	disk->flags = GENHD_FL_EXT_DEVT;
+	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+
+	/*
+	 * Initialize capacity to 0 until we establish the namespace format and
+	 * setup integrity extentions if necessary. The revalidate_disk after
+	 * add_disk allows the driver to register with integrity if the format
+	 * requires it.
+	 */
+	set_capacity(disk, 0);
+	if (nvme_revalidate_disk(ns->disk))
+		goto out_free_disk;
+
+	kref_get(&ctrl->kref);
+	if (ns->type != NVME_NS_LIGHTNVM) {
+		add_disk(ns->disk);
+		if (ns->ms) {
+			struct block_device *bd = bdget_disk(ns->disk, 0);
+			if (!bd)
+				return;
+			if (blkdev_get(bd, FMODE_READ, NULL)) {
+				bdput(bd);
+				return;
+			}
+			blkdev_reread_part(bd);
+			blkdev_put(bd, FMODE_READ);
+		}
+	}
+
+	return;
+ out_free_disk:
+	kfree(disk);
+	list_del(&ns->list);
+ out_free_queue:
+	blk_cleanup_queue(ns->queue);
+ out_free_ns:
+	kfree(ns);
+}
+
+static void nvme_ns_remove(struct nvme_ns *ns)
+{
+	bool kill = nvme_io_incapable(ns->ctrl) &&
+			!blk_queue_dying(ns->queue);
+
+	if (kill)
+		blk_set_queue_dying(ns->queue);
+	if (ns->disk->flags & GENHD_FL_UP) {
+		if (blk_get_integrity(ns->disk))
+			blk_integrity_unregister(ns->disk);
+		del_gendisk(ns->disk);
+	}
+	if (kill || !blk_queue_dying(ns->queue)) {
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_cleanup_queue(ns->queue);
+	}
+	list_del_init(&ns->list);
+	nvme_put_ns(ns);
+}
+
+static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
+{
+	struct nvme_ns *ns, *next;
+	unsigned i;
+
+	for (i = 1; i <= nn; i++) {
+		ns = nvme_find_ns(ctrl, i);
+		if (ns) {
+			if (revalidate_disk(ns->disk))
+				nvme_ns_remove(ns);
+		} else
+			nvme_alloc_ns(ctrl, i);
+	}
+	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
+		if (ns->ns_id > nn)
+			nvme_ns_remove(ns);
+	}
+	list_sort(NULL, &ctrl->namespaces, ns_cmp);
+}
+
+void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
+{
+	struct nvme_id_ctrl *id;
+
+	if (nvme_identify_ctrl(ctrl, &id))
+		return;
+	__nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
+	kfree(id);
+}
+
+void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns, *next;
+
+	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
+		nvme_ns_remove(ns);
+}
+
+int __init nvme_core_init(void)
+{
+	int result;
+
+	result = register_blkdev(nvme_major, "nvme");
+	if (result < 0)
+		return result;
+	else if (result > 0)
+		nvme_major = result;
+
+	return 0;
+}
+
+void nvme_core_exit(void)
+{
+	unregister_blkdev(nvme_major, "nvme");
+}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a624add7ca22..dfedaaa2633b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -53,6 +53,9 @@ struct nvme_ctrl {
 	struct device *dev;
 	struct kref kref;
 	int instance;
+	struct blk_mq_tag_set *tagset;
+	struct list_head namespaces;
+	struct device *device;	/* char device */
 	char name[12];
 	char serial[20];
@@ -96,6 +99,7 @@ struct nvme_ctrl_ops {
 	int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
 	int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
 	int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
+	bool (*io_incapable)(struct nvme_ctrl *ctrl);
 	void (*free_ctrl)(struct nvme_ctrl *ctrl);
 };

@@ -108,6 +112,17 @@ static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
 	return val & NVME_CSTS_RDY;
 }

+static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl)
+{
+	u32 val = 0;
+
+	if (ctrl->ops->io_incapable(ctrl))
+		return true;
+	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
+		return false;
+	return val &
NVME_CSTS_CFS; +} + static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) { return (sector >> (ns->lba_shift - 9)); @@ -181,7 +196,9 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); -void nvme_put_ns(struct nvme_ns *ns); + +void nvme_scan_namespaces(struct nvme_ctrl *ctrl); +void nvme_remove_namespaces(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); @@ -205,10 +222,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); -extern const struct block_device_operations nvme_fops; extern spinlock_t dev_list_lock; -int nvme_revalidate_disk(struct gendisk *disk); int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd); @@ -222,4 +237,7 @@ int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); int nvme_nvm_register(struct request_queue *q, char *disk_name); void nvme_nvm_unregister(struct request_queue *q, char *disk_name); +int __init nvme_core_init(void); +void nvme_core_exit(void); + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4d64aee61aea..697dc1fb5ef9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -65,9 +64,6 @@ unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -static int nvme_major; -module_param(nvme_major, int, 0); - static int nvme_char_major; module_param(nvme_char_major, int, 0); @@ -123,8 +119,6 @@ struct nvme_dev { u32 db_stride; struct msix_entry *entry; void __iomem *bar; - struct list_head namespaces; - struct device *device; struct work_struct reset_work; struct work_struct probe_work; struct work_struct scan_work; @@ -1561,90 +1555,6 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; - struct gendisk *disk; - int node = dev_to_node(dev->dev); - - ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); - if (!ns) - return; - - ns->queue = blk_mq_init_queue(&dev->tagset); - if (IS_ERR(ns->queue)) - goto out_free_ns; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - ns->ctrl = &dev->ctrl; - ns->queue->queuedata = ns; - - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_queue; - - kref_init(&ns->kref); - ns->ns_id = nsid; - ns->disk = disk; - ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - list_add_tail(&ns->list, &dev->namespaces); - - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->ctrl.max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, dev->ctrl.max_hw_sectors); - blk_queue_max_segments(ns->queue, - (dev->ctrl.max_hw_sectors / (dev->ctrl.page_size >> 9)) + 1); - } - if (dev->ctrl.stripe_size) - blk_queue_chunk_sectors(ns->queue, dev->ctrl.stripe_size >> 9); - if (dev->ctrl.vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, dev->ctrl.page_size - 1); - - disk->major = nvme_major; - disk->first_minor = 
0; - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->driverfs_dev = dev->device; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", dev->ctrl.instance, nsid); - - /* - * Initialize capacity to 0 until we establish the namespace format and - * setup integrity extentions if necessary. The revalidate_disk after - * add_disk allows the driver to register with integrity if the format - * requires it. - */ - set_capacity(disk, 0); - if (nvme_revalidate_disk(ns->disk)) - goto out_free_disk; - - kref_get(&dev->ctrl.kref); - if (ns->type != NVME_NS_LIGHTNVM) { - add_disk(ns->disk); - if (ns->ms) { - struct block_device *bd = bdget_disk(ns->disk, 0); - if (!bd) - return; - if (blkdev_get(bd, FMODE_READ, NULL)) { - bdput(bd); - return; - } - blkdev_reread_part(bd); - blkdev_put(bd, FMODE_READ); - } - } - return; - out_free_disk: - kfree(disk); - list_del(&ns->list); - out_free_queue: - blk_cleanup_queue(ns->queue); - out_free_ns: - kfree(ns); -} - /* * Create I/O queues. Failing to create an I/O queue is not an issue, * we can continue with less than the desired amount of queues, and @@ -1827,71 +1737,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return result; } -static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); - struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); - - return nsa->ns_id - nsb->ns_id; -} - -static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->ns_id == nsid) - return ns; - if (ns->ns_id > nsid) - break; - } - return NULL; -} - -static inline bool nvme_io_incapable(struct nvme_dev *dev) -{ - return (!dev->bar || - readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_CFS || - dev->online_queues < 2); -} - -static void nvme_ns_remove(struct nvme_ns *ns) -{ - bool kill = nvme_io_incapable(to_nvme_dev(ns->ctrl)) && - !blk_queue_dying(ns->queue); - - if (kill) - blk_set_queue_dying(ns->queue); - if (ns->disk->flags & GENHD_FL_UP) - del_gendisk(ns->disk); - if (kill || !blk_queue_dying(ns->queue)) { - blk_mq_abort_requeue_list(ns->queue); - blk_cleanup_queue(ns->queue); - } - list_del_init(&ns->list); - nvme_put_ns(ns); -} - -static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) -{ - struct nvme_ns *ns, *next; - unsigned i; - - for (i = 1; i <= nn; i++) { - ns = nvme_find_ns(dev, i); - if (ns) { - if (revalidate_disk(ns->disk)) - nvme_ns_remove(ns); - } else - nvme_alloc_ns(dev, i); - } - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - if (ns->ns_id > nn) - nvme_ns_remove(ns); - } - list_sort(NULL, &dev->namespaces, ns_cmp); -} - static void nvme_set_irq_hints(struct nvme_dev *dev) { struct nvme_queue *nvmeq; @@ -1911,14 +1756,10 @@ static void nvme_set_irq_hints(struct nvme_dev *dev) static void nvme_dev_scan(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); - struct nvme_id_ctrl *ctrl; if (!dev->tagset.tags) return; - if (nvme_identify_ctrl(&dev->ctrl, &ctrl)) - return; - nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); - kfree(ctrl); + nvme_scan_namespaces(&dev->ctrl); nvme_set_irq_hints(dev); } @@ -1930,7 +1771,7 @@ static void nvme_dev_scan(struct work_struct *work) */ static int nvme_dev_add(struct nvme_dev *dev) { - if (!dev->tagset.tags) { + if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = 
dev->online_queues - 1; dev->tagset.timeout = NVME_IO_TIMEOUT; @@ -1943,6 +1784,7 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; + dev->ctrl.tagset = &dev->tagset; } schedule_work(&dev->scan_work); return 0; @@ -2197,7 +2039,7 @@ static void nvme_freeze_queues(struct nvme_dev *dev) { struct nvme_ns *ns; - list_for_each_entry(ns, &dev->namespaces, list) { + list_for_each_entry(ns, &dev->ctrl.namespaces, list) { blk_mq_freeze_queue_start(ns->queue); spin_lock_irq(ns->queue->queue_lock); @@ -2213,7 +2055,7 @@ static void nvme_unfreeze_queues(struct nvme_dev *dev) { struct nvme_ns *ns; - list_for_each_entry(ns, &dev->namespaces, list) { + list_for_each_entry(ns, &dev->ctrl.namespaces, list) { queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); blk_mq_unfreeze_queue(ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); @@ -2248,14 +2090,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_clear_queue(dev->queues[i]); } -static void nvme_dev_remove(struct nvme_dev *dev) -{ - struct nvme_ns *ns, *next; - - list_for_each_entry_safe(ns, next, &dev->namespaces, list) - nvme_ns_remove(ns); -} - static int nvme_setup_prp_pools(struct nvme_dev *dev) { dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, @@ -2313,7 +2147,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) struct nvme_dev *dev = to_nvme_dev(ctrl); put_device(dev->dev); - put_device(dev->device); + put_device(ctrl->device); nvme_release_instance(dev); if (dev->tagset.tags) blk_mq_free_tag_set(&dev->tagset); @@ -2365,9 +2199,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(&dev->ctrl, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - if (list_empty(&dev->namespaces)) + if (list_empty(&dev->ctrl.namespaces)) return -ENOTTY; - ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); + ns = list_first_entry(&dev->ctrl.namespaces, struct nvme_ns, list); return nvme_user_cmd(&dev->ctrl, ns, (void __user *)arg); case NVME_IOCTL_RESET: dev_warn(dev->dev, "resetting controller\n"); @@ -2441,7 +2275,7 @@ static void nvme_probe_work(struct work_struct *work) */ if (dev->online_queues < 2) { dev_warn(dev->dev, "IO queues not created\n"); - nvme_dev_remove(dev); + nvme_remove_namespaces(&dev->ctrl); } else { nvme_unfreeze_queues(dev); nvme_dev_add(dev); @@ -2571,10 +2405,18 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) return 0; } +static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) +{ + struct nvme_dev *dev = to_nvme_dev(ctrl); + + return !dev->bar || dev->online_queues < 2; +} + static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, + .io_incapable = nvme_pci_io_incapable, .free_ctrl = nvme_pci_free_ctrl, }; @@ -2599,7 +2441,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev->queues) goto free; - INIT_LIST_HEAD(&dev->namespaces); + INIT_LIST_HEAD(&dev->ctrl.namespaces); INIT_WORK(&dev->reset_work, nvme_reset_work); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); @@ -2617,17 +2459,17 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release; kref_init(&dev->ctrl.kref); - dev->device = device_create(nvme_class, &pdev->dev, + dev->ctrl.device = device_create(nvme_class, &pdev->dev, MKDEV(nvme_char_major, dev->ctrl.instance), dev, "nvme%d", 
dev->ctrl.instance); - if (IS_ERR(dev->device)) { - result = PTR_ERR(dev->device); + if (IS_ERR(dev->ctrl.device)) { + result = PTR_ERR(dev->ctrl.device); goto release_pools; } - get_device(dev->device); - dev_set_drvdata(dev->device, dev); + get_device(dev->ctrl.device); + dev_set_drvdata(dev->ctrl.device, dev); - result = device_create_file(dev->device, &dev_attr_reset_controller); + result = device_create_file(dev->ctrl.device, &dev_attr_reset_controller); if (result) goto put_dev; @@ -2639,7 +2481,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) put_dev: device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance)); - put_device(dev->device); + put_device(dev->ctrl.device); release_pools: nvme_release_prp_pools(dev); release: @@ -2681,8 +2523,8 @@ static void nvme_remove(struct pci_dev *pdev) flush_work(&dev->probe_work); flush_work(&dev->reset_work); flush_work(&dev->scan_work); - device_remove_file(dev->device, &dev_attr_reset_controller); - nvme_dev_remove(dev); + device_remove_file(dev->ctrl.device, &dev_attr_reset_controller); + nvme_remove_namespaces(&dev->ctrl); nvme_dev_shutdown(dev); nvme_dev_remove_admin(dev); device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance)); @@ -2764,11 +2606,9 @@ static int __init nvme_init(void) if (!nvme_workq) return -ENOMEM; - result = register_blkdev(nvme_major, "nvme"); + result = nvme_core_init(); if (result < 0) goto kill_workq; - else if (result > 0) - nvme_major = result; result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", &nvme_dev_fops); @@ -2793,7 +2633,7 @@ static int __init nvme_init(void) unregister_chrdev: __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); + nvme_core_exit(); kill_workq: destroy_workqueue(nvme_workq); return result; @@ -2802,7 +2642,7 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - unregister_blkdev(nvme_major, "nvme"); + nvme_core_exit(); destroy_workqueue(nvme_workq); class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); From f3ca80fc11c3af566eacd99cf821c1a48035c63b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 28 Nov 2015 15:40:19 +0100 Subject: [PATCH 21/67] nvme: move chardev and sysfs interface to common code For this we need to add a proper controller init routine and a list of all controllers that is in addition to the list of PCIe controllers, which stays in pci.c. Note that we remove the sysfs device when the last reference to a controller is dropped now - the old code would have kept it around longer, which doesn't make much sense. This requires a new ->reset_ctrl operation to implement controller resets, and a new ->reg_write32 operation that is required to implement subsystem resets. We also now store cached copies of the NVMe compliance version and a flag indicating whether the controller is attached to a subsystem in the generic controller structure.
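With the chardev and sysfs handling in common code, a transport driver only needs to fill in an ops table and call the shared init helper. A minimal sketch of the resulting probe-side pattern (the nvme_demo_* names and the queue setup helper are illustrative assumptions, not part of this patch):

static const struct nvme_ctrl_ops nvme_demo_ctrl_ops = {
	.reg_read32	= nvme_demo_reg_read32,		/* hypothetical accessors */
	.reg_write32	= nvme_demo_reg_write32,
	.reg_read64	= nvme_demo_reg_read64,
	.io_incapable	= nvme_demo_io_incapable,
	.reset_ctrl	= nvme_demo_reset_ctrl,
	.free_ctrl	= nvme_demo_free_ctrl,
};

static int nvme_demo_probe(struct device *dev, struct nvme_demo_ctrl *demo)
{
	int ret;

	/*
	 * nvme_init_ctrl() allocates the instance number, creates the
	 * /dev/nvme<N> char device and the reset_controller sysfs
	 * attribute, and links the controller into nvme_ctrl_list.
	 */
	ret = nvme_init_ctrl(&demo->ctrl, dev, &nvme_demo_ctrl_ops, 0);
	if (ret)
		return ret;

	ret = nvme_demo_setup_queues(demo);	/* transport specific, assumed */
	if (ret)
		nvme_put_ctrl(&demo->ctrl);	/* last put tears down the chardev */
	return ret;
}

Because the char device now lives and dies with the controller reference count, the final nvme_put_ctrl() is what removes /dev/nvme<N>; transports must not destroy it themselves.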
Signed-off-by: Christoph Hellwig [Fixes for pr merge] Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 215 +++++++++++++++++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 18 +++- drivers/nvme/host/pci.c | 203 ++++-------------------------------- drivers/nvme/host/scsi.c | 13 +-- 4 files changed, 245 insertions(+), 204 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1b8498434b49..9c7dfd1476a7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -31,11 +31,19 @@ #include "nvme.h" +#define NVME_MINORS (1U << MINORBITS) + static int nvme_major; module_param(nvme_major, int, 0); +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + +static LIST_HEAD(nvme_ctrl_list); DEFINE_SPINLOCK(dev_list_lock); +static struct class *nvme_class; + static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); @@ -367,7 +375,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, io.slba, NULL, 0); } -int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; @@ -792,6 +800,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) u64 cap; int ret, page_shift; + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret); + return ret; + } + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); if (ret) { dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); @@ -799,6 +813,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } page_shift = NVME_CAP_MPSMIN(cap) + 12; + if (ctrl->vs >= NVME_VS(1, 1)) + ctrl->subsystem = NVME_CAP_NSSRC(cap); + ret = nvme_identify_ctrl(ctrl, &id); if (ret) { dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); @@ -833,18 +850,85 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return 0; } -static void nvme_free_ctrl(struct kref *kref) +static int nvme_dev_open(struct inode *inode, struct file *file) { - struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + struct nvme_ctrl *ctrl; + int instance = iminor(inode); + int ret = -ENODEV; - ctrl->ops->free_ctrl(ctrl); + spin_lock(&dev_list_lock); + list_for_each_entry(ctrl, &nvme_ctrl_list, node) { + if (ctrl->instance != instance) + continue; + + if (!ctrl->admin_q) { + ret = -EWOULDBLOCK; + break; + } + if (!kref_get_unless_zero(&ctrl->kref)) + break; + file->private_data = ctrl; + ret = 0; + break; + } + spin_unlock(&dev_list_lock); + + return ret; } -void nvme_put_ctrl(struct nvme_ctrl *ctrl) +static int nvme_dev_release(struct inode *inode, struct file *file) { - kref_put(&ctrl->kref, nvme_free_ctrl); + nvme_put_ctrl(file->private_data); + return 0; } +static long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + if (list_empty(&ctrl->namespaces)) + return -ENOTTY; + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + return nvme_user_cmd(ctrl, ns, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->dev, "resetting controller\n"); + return ctrl->ops->reset_ctrl(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + 
default: + return -ENOTTY; + } +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = ctrl->ops->reset_ctrl(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) { struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); @@ -1009,6 +1093,104 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) nvme_ns_remove(ns); } +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_ctrl *ctrl) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + ctrl->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_ctrl *ctrl) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, ctrl->instance); + spin_unlock(&dev_list_lock); +} + +static void nvme_free_ctrl(struct kref *kref) +{ + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + + spin_lock(&dev_list_lock); + list_del(&ctrl->node); + spin_unlock(&dev_list_lock); + + put_device(ctrl->device); + nvme_release_instance(ctrl); + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + + ctrl->ops->free_ctrl(ctrl); +} + +void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + kref_put(&ctrl->kref, nvme_free_ctrl); +} + +/* + * Initialize an NVMe controller structure. This needs to be called during + * the earliest initialization so that we have the initialized structure around + * during probing.
+ */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks) +{ + int ret; + + INIT_LIST_HEAD(&ctrl->namespaces); + kref_init(&ctrl->kref); + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->quirks = quirks; + + ret = nvme_set_instance(ctrl); + if (ret) + goto out; + + ctrl->device = device_create(nvme_class, ctrl->dev, + MKDEV(nvme_char_major, ctrl->instance), + dev, "nvme%d", ctrl->instance); + if (IS_ERR(ctrl->device)) { + ret = PTR_ERR(ctrl->device); + goto out_release_instance; + } + get_device(ctrl->device); + dev_set_drvdata(ctrl->device, ctrl); + + ret = device_create_file(ctrl->device, &dev_attr_reset_controller); + if (ret) + goto out_put_device; + + spin_lock(&dev_list_lock); + list_add_tail(&ctrl->node, &nvme_ctrl_list); + spin_unlock(&dev_list_lock); + + return 0; + +out_put_device: + put_device(ctrl->device); + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); +out_release_instance: + nvme_release_instance(ctrl); +out: + return ret; +} + int __init nvme_core_init(void) { int result; @@ -1019,10 +1201,31 @@ int __init nvme_core_init(void) else if (result > 0) nvme_major = result; + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + return 0; + + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_blkdev: + unregister_blkdev(nvme_major, "nvme"); + return result; } void nvme_core_exit(void) { unregister_blkdev(nvme_major, "nvme"); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index dfedaaa2633b..93378be874e1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,8 +19,6 @@ #include #include -struct nvme_passthru_cmd; - extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) @@ -56,6 +54,7 @@ struct nvme_ctrl { struct blk_mq_tag_set *tagset; struct list_head namespaces; struct device *device; /* char device */ + struct list_head node; char name[12]; char serial[20]; @@ -71,6 +70,8 @@ struct nvme_ctrl { u16 abort_limit; u8 event_limit; u8 vwc; + u32 vs; + bool subsystem; unsigned long quirks; }; @@ -100,6 +101,7 @@ struct nvme_ctrl_ops { int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); bool (*io_incapable)(struct nvme_ctrl *ctrl); + int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); }; @@ -123,6 +125,13 @@ static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) return val & NVME_CSTS_CFS; } +static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsystem) + return -ENOTTY; + return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); +} + static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) { return (sector >> (ns->lba_shift - 9)); @@ -194,6 +203,8 @@ static inline int nvme_error_status(u16 status) int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks); 
void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); @@ -224,9 +235,6 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, extern spinlock_t dev_list_lock; -int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd); - struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 697dc1fb5ef9..87ad57bcc7ed 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -38,15 +38,11 @@ #include #include #include -#include -#include #include #include -#include #include "nvme.h" -#define NVME_MINORS (1U << MINORBITS) #define NVME_Q_DEPTH 1024 #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) @@ -64,9 +60,6 @@ unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -79,8 +72,6 @@ static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; -static struct class *nvme_class; - struct nvme_dev; struct nvme_queue; struct nvme_iod; @@ -1505,15 +1496,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_subsys_reset(struct nvme_dev *dev) -{ - if (!dev->subsystem) - return -ENOTTY; - - writel(0x4E564D65, dev->bar + NVME_REG_NSSR); /* "NVMe" */ - return 0; -} - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -2113,42 +2095,11 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_dev *dev) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - dev->ctrl.instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_dev *dev) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->ctrl.instance); - spin_unlock(&dev_list_lock); -} - static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); put_device(dev->dev); - put_device(ctrl->device); - nvme_release_instance(dev); if (dev->tagset.tags) blk_mq_free_tag_set(&dev->tagset); if (dev->ctrl.admin_q) @@ -2158,69 +2109,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } -static int nvme_dev_open(struct inode *inode, struct file *f) -{ - struct nvme_dev *dev; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { - if (dev->ctrl.instance == instance) { - if (!dev->ctrl.admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kref_get_unless_zero(&dev->ctrl.kref)) - break; - f->private_data = dev; - ret = 0; - break; - } - } - spin_unlock(&dev_list_lock); - - return ret; -} - -static int nvme_dev_release(struct inode *inode, struct file *f) -{ - struct nvme_dev *dev = f->private_data; - nvme_put_ctrl(&dev->ctrl); - return 0; -} - -static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long 
arg) -{ - struct nvme_dev *dev = f->private_data; - struct nvme_ns *ns; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(&dev->ctrl, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - if (list_empty(&dev->ctrl.namespaces)) - return -ENOTTY; - ns = list_first_entry(&dev->ctrl.namespaces, struct nvme_ns, list); - return nvme_user_cmd(&dev->ctrl, ns, (void __user *)arg); - case NVME_IOCTL_RESET: - dev_warn(dev->dev, "resetting controller\n"); - return nvme_reset(dev); - case NVME_IOCTL_SUBSYS_RESET: - return nvme_subsys_reset(dev); - default: - return -ENOTTY; - } -} - -static const struct file_operations nvme_dev_fops = { - .owner = THIS_MODULE, - .open = nvme_dev_open, - .release = nvme_dev_release, - .unlocked_ioctl = nvme_dev_ioctl, - .compat_ioctl = nvme_dev_ioctl, -}; - static void nvme_probe_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); @@ -2372,21 +2260,6 @@ static int nvme_reset(struct nvme_dev *dev) return ret; } -static ssize_t nvme_sysfs_reset(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct nvme_dev *ndev = dev_get_drvdata(dev); - int ret; - - ret = nvme_reset(ndev); - if (ret < 0) - return ret; - - return count; -} -static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); - static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { *val = readl(to_nvme_dev(ctrl)->bar + off); @@ -2412,11 +2285,17 @@ static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) return !dev->bar || dev->online_queues < 2; } +static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) +{ + return nvme_reset(to_nvme_dev(ctrl)); +} + static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, .io_incapable = nvme_pci_io_incapable, + .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, }; @@ -2441,51 +2320,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev->queues) goto free; - INIT_LIST_HEAD(&dev->ctrl.namespaces); - INIT_WORK(&dev->reset_work, nvme_reset_work); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); - dev->ctrl.ops = &nvme_pci_ctrl_ops; - dev->ctrl.dev = dev->dev; - dev->ctrl.quirks = id->driver_data; - - result = nvme_set_instance(dev); - if (result) - goto put_pci; - - result = nvme_setup_prp_pools(dev); - if (result) - goto release; - - kref_init(&dev->ctrl.kref); - dev->ctrl.device = device_create(nvme_class, &pdev->dev, - MKDEV(nvme_char_major, dev->ctrl.instance), - dev, "nvme%d", dev->ctrl.instance); - if (IS_ERR(dev->ctrl.device)) { - result = PTR_ERR(dev->ctrl.device); - goto release_pools; - } - get_device(dev->ctrl.device); - dev_set_drvdata(dev->ctrl.device, dev); - - result = device_create_file(dev->ctrl.device, &dev_attr_reset_controller); - if (result) - goto put_dev; - INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->probe_work, nvme_probe_work); + INIT_WORK(&dev->reset_work, nvme_reset_work); + + result = nvme_setup_prp_pools(dev); + if (result) + goto put_pci; + + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + id->driver_data); + if (result) + goto release_pools; + schedule_work(&dev->probe_work); return 0; - put_dev: - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance)); - put_device(dev->ctrl.device); release_pools: nvme_release_prp_pools(dev); - release: - 
nvme_release_instance(dev); put_pci: put_device(dev->dev); free: @@ -2523,11 +2379,9 @@ static void nvme_remove(struct pci_dev *pdev) flush_work(&dev->probe_work); flush_work(&dev->reset_work); flush_work(&dev->scan_work); - device_remove_file(dev->ctrl.device, &dev_attr_reset_controller); nvme_remove_namespaces(&dev->ctrl); nvme_dev_shutdown(dev); nvme_dev_remove_admin(dev); - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance)); nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); @@ -2610,29 +2464,12 @@ static int __init nvme_init(void) if (result < 0) goto kill_workq; - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); - if (result < 0) - goto unregister_blkdev; - else if (result > 0) - nvme_char_major = result; - - nvme_class = class_create(THIS_MODULE, "nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); - goto unregister_chrdev; - } - result = pci_register_driver(&nvme_driver); if (result) - goto destroy_class; + goto core_exit; return 0; - destroy_class: - class_destroy(nvme_class); - unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev: + core_exit: nvme_core_exit(); kill_workq: destroy_workqueue(nvme_workq); @@ -2644,8 +2481,6 @@ static void __exit nvme_exit(void) pci_unregister_driver(&nvme_driver); nvme_core_exit(); destroy_workqueue(nvme_workq); - class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index eaf725610fe2..e947e298a737 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -600,7 +600,7 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, } static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len, u32 vs) + u8 *inq_response, int alloc_len) { struct nvme_id_ns *id_ns; int nvme_sc, res; @@ -615,7 +615,7 @@ static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, eui = id_ns->eui64; len = sizeof(id_ns->eui64); - if (vs >= NVME_VS(1, 2)) { + if (ns->ctrl->vs >= NVME_VS(1, 2)) { if (bitmap_empty(eui, len * 8)) { eui = id_ns->nguid; len = sizeof(id_ns->nguid); @@ -687,14 +687,9 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *resp, int alloc_len) { int res; - u32 vs; - res = ns->ctrl->ops->reg_read32(ns->ctrl, NVME_REG_VS, &vs); - if (res) - return res; - - if (vs >= NVME_VS(1, 1)) { - res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len, vs); + if (ns->ctrl->vs >= NVME_VS(1, 1)) { + res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); if (res != -EOPNOTSUPP) return res; } From 9a0be7abb62ff2a7dc3360ab45c31f29b3faf642 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 11:09:06 +0100 Subject: [PATCH 22/67] nvme: refactor set_queue_count Split out a helper that just issues the Set Features and interprets the result which can go to common code, and document why we are ignoring non-timeout error returns in the PCIe driver. 
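The resulting helper has a three-way contract: it returns 0 and clamps *count to what the controller granted, a negative Linux errno if the command failed to execute, and a positive NVMe status if the controller rejected it. A caller honoring that contract looks roughly like this (illustrative sketch; the PCIe driver's actual version is in the diff below):

	int nr_io_queues = num_possible_cpus();
	int ret;

	ret = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (ret < 0)
		return ret;		/* command failed to execute: give up */
	if (ret > 0)
		nr_io_queues = 0;	/* degraded controller: admin queue only */
	/* otherwise nr_io_queues now holds the granted I/O queue count */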
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 16 ++++++++++++++++ drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 34 +++++++++++++--------------------- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9c7dfd1476a7..c61bde9921d2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -327,6 +327,22 @@ int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) return error; } +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status) + return status; + + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + return 0; +} + static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 93378be874e1..b75d41e5c378 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -232,6 +232,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result); int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); extern spinlock_t dev_list_lock; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 87ad57bcc7ed..a64d0baacc58 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1559,23 +1559,6 @@ static void nvme_create_io_queues(struct nvme_dev *dev) } } -static int set_queue_count(struct nvme_dev *dev, int count) -{ - int status; - u32 result; - u32 q_count = (count - 1) | ((count - 1) << 16); - - status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, - &result); - if (status < 0) - return status; - if (status > 0) { - dev_err(dev->dev, "Could not set queue count (%d)\n", status); - return 0; - } - return min(result & 0xffff, result >> 16) + 1; -} - static void __iomem *nvme_map_cmb(struct nvme_dev *dev) { u64 szu, size, offset; @@ -1640,11 +1623,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); - result = set_queue_count(dev, nr_io_queues); - if (result <= 0) + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) return result; - if (result < nr_io_queues) - nr_io_queues = result; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be the only way to fix them up. + */ + if (result > 0) { + dev_err(dev->dev, "Could not set queue count (%d)\n", result); + nr_io_queues = 0; + result = 0; + } if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { result = nvme_cmb_qdepth(dev, nr_io_queues, From 06c1e3902aa74b7432a7e82bb4a5aca233a42839 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 3 Dec 2015 09:32:21 -0700 Subject: [PATCH 23/67] blk-integrity: empty implementation when disabled This patch moves the blk_integrity_payload definition outside the CONFIG_BLK_DEV_INTEGRITY dependency and provides empty function implementations when the kernel configuration disables integrity extensions.
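The caller-visible effect is that a single IS_ERR() check now works whether or not integrity support is compiled in, roughly as sketched here (bio and the surrounding function are assumed context):

	struct bio_integrity_payload *bip;

	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
	if (IS_ERR(bip))
		return PTR_ERR(bip);	/* -EINVAL when CONFIG_BLK_DEV_INTEGRITY is off */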
This simplifies drivers that make use of these to map user data so they don't need to repeat the same configuration checks. Signed-off-by: Keith Busch Updated by Jens to pass an error pointer return from bio_integrity_alloc(), otherwise if CONFIG_BLK_DEV_INTEGRITY isn't set, we return a weird ENOMEM from __nvme_submit_user_cmd() if a meta buffer is set. Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- drivers/nvme/host/core.c | 4 ++-- drivers/target/target_core_iblock.c | 4 ++-- include/linux/bio.h | 32 ++++++++++++++++++++--------- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f6325d573c10..e6ba501eb746 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, } if (unlikely(!bip)) - return NULL; + return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); @@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: mempool_free(bip, bs->bio_integrity_pool); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c61bde9921d2..f9c4e80c2441 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -190,8 +190,8 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, } bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (!bip) { - ret = -ENOMEM; + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); goto out_free_meta; } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index f29c69120054..d5891b6ea737 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -613,9 +613,9 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio) } bip = bio_integrity_alloc(bio, GFP_NOIO, cmd->t_prot_nents); - if (!bip) { + if (IS_ERR(bip)) { pr_err("Unable to allocate bio_integrity_payload\n"); - return -ENOMEM; + return PTR_ERR(bip); } bip->bip_iter.bi_size = (cmd->data_length / dev->dev_attrib.block_size) * diff --git a/include/linux/bio.h b/include/linux/bio.h index b9b6e046b52e..5349e6816cbb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -318,16 +318,6 @@ enum bip_flags { BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_rw & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - /* * bio integrity payload */ @@ -349,6 +339,16 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } +static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, + unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ From ac02dddec63385ffef1397d3f56cec4108bcafe9 Mon Sep 17 
00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2015 09:52:05 -0700 Subject: [PATCH 24/67] NVMe: fix build with CONFIG_NVM enabled Looks like I didn't test with CONFIG_NVM enabled, and neither did the build bot. Most of this is really weird crazy shit in the lightnvm support, though. Struct nvme_ns is a structure for the NVM I/O command set, and it has no business poking into it. Second, this commit: commit 47b3115ae7b799be8b77b0f024215ad4f68d6460 Author: Wenwei Tao Date: Fri Nov 20 13:47:55 2015 +0100 nvme: lightnvm: use admin queues for admin cmds Does even more crazy stuff. If a function gets a request_queue parameter passed it'd better use that and not look for another one. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index d5622f9164ad..09cf0b99d2fa 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -276,7 +276,6 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) { struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; int ret; @@ -289,7 +288,7 @@ static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) if (!nvme_nvm_id) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, nvme_nvm_id, sizeof(struct nvme_nvm_id)); if (ret) { ret = -EIO; @@ -314,9 +313,8 @@ static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(dev->admin_q) << 9; + u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; u32 nlb_pr_rq = len / sizeof(u64); u64 cmd_slba = slba; void *entries; @@ -334,10 +332,10 @@ static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, c.l2p.slba = cpu_to_le64(cmd_slba); c.l2p.nlb = cpu_to_le32(cmd_nlb); - ret = nvme_submit_sync_cmd(dev->admin_q, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, entries, len); if (ret) { - dev_err(dev->dev, "L2P table transfer failed (%d)\n", + dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n", ret); ret = -EIO; goto out; @@ -362,7 +360,7 @@ static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, void *priv) { struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; @@ -376,30 +374,30 @@ static int nvme_nvm_get_bb_tbl(struct request_queue *q, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { - dev_err(dev->dev, "get bad block table failed (%d)\n", ret); + dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(dev->dev, "bbt format mismatch\n"); +
dev_err(ctrl->dev, "bbt format mismatch\n"); ret = -EINVAL; goto out; } if (le16_to_cpu(bb_tbl->verid) != 1) { ret = -EINVAL; - dev_err(dev->dev, "bbt version not supported\n"); + dev_err(ctrl->dev, "bbt version not supported\n"); goto out; } if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { ret = -EINVAL; - dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", + dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", le32_to_cpu(bb_tbl->tblks), nr_blocks); goto out; } @@ -419,7 +417,6 @@ static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, int type) { struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -429,10 +426,10 @@ static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); c.set_bb.value = type; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, NULL, 0); if (ret) - dev_err(dev->dev, "set bad block table failed (%d)\n", ret); + dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret); return ret; } @@ -518,9 +515,8 @@ static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) static void *nvme_nvm_create_dma_pool(struct request_queue *q, char *name) { struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; - return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); + return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); } static void nvme_nvm_destroy_dma_pool(void *pool) @@ -573,8 +569,9 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name) int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { - struct nvme_dev *dev = ns->dev; - struct pci_dev *pdev = to_pci_dev(dev->dev); + struct nvme_ctrl *ctrl = ns->ctrl; + /* XXX: this is poking into PCI structures from generic code! */ + struct pci_dev *pdev = to_pci_dev(ctrl->dev); /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ if (pdev->vendor == PCI_VENDOR_ID_INTEL && pdev->device == 0x5845 && From d1ea7be5f755bf1a4d4fdccc35880fcf5069df60 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 8 Dec 2015 16:22:17 +0100 Subject: [PATCH 25/67] nvme: fix another 32-bit build warning The nvme_user_cmd function was recently moved around from one file to another, which made a warning reappear that I had fixed before at some point: drivers/nvme/host/core.c: In function 'nvme_user_cmd': drivers/nvme/host/core.c:424:4: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] This applies the same workaround that we have elsewhere in the driver with an extra type cast to uintptr_t. Signed-off-by: Arnd Bergmann Fixes: 1673f1f08c88 ("nvme: move block_device_operations and ns/ctrl freeing to common code") Link: https://lkml.org/lkml/2015/10/9/611 Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f9c4e80c2441..47ebfb85b14b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -421,7 +421,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, timeout = msecs_to_jiffies(cmd.timeout_ms); status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, - (void __user *)cmd.addr, cmd.data_len, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, &cmd.result, timeout); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) From 7b6c0f8034d78390f9185e2ec2edb0a3e4ad244e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 9 Dec 2015 13:21:23 +0300 Subject: [PATCH 26/67] blk-integrity: checking for NULL instead of IS_ERR We recently changed bio_integrity_alloc() to return ERR_PTRs instead of NULL but these calls were missed. Fixes: 06c1e3902aa7 ('blk-integrity: empty implementation when disabled') Signed-off-by: Dan Carpenter Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bio-integrity.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index e6ba501eb746..711e4d8de6fa 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { + if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - return -EIO; + return PTR_ERR(bip); } bip->bip_flags |= BIP_BLOCK_INTEGRITY; @@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; + if (IS_ERR(bip)) + return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); From 8c0b39155048d5a24f25c6c60aa83729927b04cd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 9 Dec 2015 13:24:06 +0300 Subject: [PATCH 27/67] nvme: precedence bug in nvme_pr_clear() The "|" operator has higher precedence than "?:", so this didn't work as intended. I had previously fixed this bug, but we copied the older unfixed version when we moved the function between files. Fixes: 1673f1f08c88 ('nvme: move block_device_operations and ns/ctrl freeing to common code') Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 47ebfb85b14b..64891ebc4c52 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -671,7 +671,7 @@ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, static int nvme_pr_clear(struct block_device *bdev, u64 key) { - u32 cdw10 = 1 | key ? 1 << 3 : 0; + u32 cdw10 = 1 | (key ? 1 << 3 : 0); return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); } From 287922eb0b186e2a5bf54fdd04b734c25c90035c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 30 Oct 2015 20:57:30 +0800 Subject: [PATCH 28/67] block: defer timeouts to a workqueue Timer context is not very useful for drivers to perform any meaningful abort action from. So instead of calling the driver from this useless context, defer it to a workqueue as soon as possible. Note that while a delayed_work item would seem the right thing here I didn't dare to use it due to the magic in blk_add_timer that pokes deep into timer internals. But maybe this encourages Tejun to add a sensible API for that to the workqueue API and we'll all be fine in the end :) Contains a major update from Keith Busch: "This patch removes synchronizing the timeout work so that the timer can start a freeze on its own queue.
The timer enters the queue, so timer context can only start a freeze, but not wait for frozen." Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++++ block/blk-mq.c | 11 ++++++++--- block/blk-timeout.c | 8 ++++++-- block/blk.h | 2 +- include/linux/blkdev.h | 1 + 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5ec996036e16..7e01002dfdde 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -664,6 +664,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -825,6 +832,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; diff --git a/block/blk-mq.c b/block/blk-mq.c index 93a4e1956915..9cb2894840ab 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -615,15 +615,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, } } -static void blk_mq_rq_timer(unsigned long priv) +static void blk_mq_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *)priv; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, }; int i; + if (blk_queue_enter(q, true)) + return; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { @@ -638,6 +642,7 @@ static void blk_mq_rq_timer(unsigned long priv) blk_mq_tag_idle(hctx); } } + blk_queue_exit(q); } /* @@ -2015,7 +2020,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? 
set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 3610af561748..dd4fdfbcb3dd 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** diff --git a/block/blk.h b/block/blk.h index c43926d3d74d..70e4aee9cdcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void) } #endif -void blk_rq_timed_out_timer(unsigned long data); +void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e711f294934c..221dc3bac49f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -407,6 +407,7 @@ struct request_queue { unsigned int rq_timeout; struct timer_list timeout; + struct work_struct timeout_work; struct list_head timeout_list; struct list_head icq_list; From 749941f2365db8198b5d75c83a575ee6e55bf03b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 11:46:39 +0100 Subject: [PATCH 29/67] nvme: only ignore hardware errors in nvme_create_io_queues Half initialized queues due to kernel error returns or timeout are still a good reason to give up on initializing a controller. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a64d0baacc58..1f92b328522a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1537,26 +1537,33 @@ static int nvme_kthread(void *data) return 0; } -/* - * Create I/O queues. Failing to create an I/O queue is not an issue, - * we can continue with less than the desired amount of queues, and - * even a controller without I/O queues an still be used to issue - * admin commands. This might be useful to upgrade a buggy firmware - * for example. 
- */ -static void nvme_create_io_queues(struct nvme_dev *dev) +static int nvme_create_io_queues(struct nvme_dev *dev) { unsigned i; + int ret = 0; - for (i = dev->queue_count; i <= dev->max_qid; i++) - if (!nvme_alloc_queue(dev, i, dev->q_depth)) + for (i = dev->queue_count; i <= dev->max_qid; i++) { + if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; break; + } + } - for (i = dev->online_queues; i <= dev->queue_count - 1; i++) - if (nvme_create_queue(dev->queues[i], i)) { + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) { + ret = nvme_create_queue(dev->queues[i], i); + if (ret) { nvme_free_queues(dev, i); break; } + } + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. + */ + return ret >= 0 ? 0 : ret; } static void __iomem *nvme_map_cmb(struct nvme_dev *dev) @@ -1702,9 +1709,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_create_io_queues(dev); - - return 0; + return nvme_create_io_queues(dev); free_queues: nvme_free_queues(dev, 1); From 7385014c073263b077442439299fad013edd4409 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Oct 2015 14:03:33 +0200 Subject: [PATCH 30/67] nvme: only add a controller to dev_list after it's been fully initialized Without this we can easily get bad dereferences on nvmeq->d_db when the nvme kthread tries to poll the CQs for controllers that are in a half-initialized state. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 51 ++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1f92b328522a..d82f08d671e6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1994,6 +1994,30 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) kthread_stop(kworker_task); } +static int nvme_dev_list_add(struct nvme_dev *dev) +{ + bool start_thread = false; + + spin_lock(&dev_list_lock); + if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { + start_thread = true; + nvme_thread = NULL; + } + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + + if (start_thread) { + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + wake_up_all(&nvme_kthread_wait); + } else + wait_event_killable(nvme_kthread_wait, nvme_thread); + + if (IS_ERR_OR_NULL(nvme_thread)) + return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; + + return 0; +} + /* * Remove the node from the device list and check * for whether or not we need to stop the nvme_thread.
@@ -2109,7 +2133,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) static void nvme_probe_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); - bool start_thread = false; int result; result = nvme_dev_map(dev); @@ -2120,25 +2143,6 @@ static void nvme_probe_work(struct work_struct *work) if (result) goto unmap; - spin_lock(&dev_list_lock); - if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { - start_thread = true; - nvme_thread = NULL; - } - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - if (start_thread) { - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up_all(&nvme_kthread_wait); - } else - wait_event_killable(nvme_kthread_wait, nvme_thread); - - if (IS_ERR_OR_NULL(nvme_thread)) { - result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; - goto disable; - } - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) @@ -2154,6 +2158,10 @@ static void nvme_probe_work(struct work_struct *work) dev->ctrl.event_limit = 1; + result = nvme_dev_list_add(dev); + if (result) + goto remove; + /* * Keep the controller around but remove all namespaces if we don't have * any working I/O queue. @@ -2168,6 +2176,8 @@ static void nvme_probe_work(struct work_struct *work) return; + remove: + nvme_dev_list_remove(dev); free_tags: nvme_dev_remove_admin(dev); blk_put_queue(dev->ctrl.admin_q); @@ -2175,7 +2185,6 @@ static void nvme_probe_work(struct work_struct *work) dev->queues[0]->tags = NULL; disable: nvme_disable_queue(dev, 0); - nvme_dev_list_remove(dev); unmap: nvme_dev_unmap(dev); out: From 77bf25ea70200cddf083f74b7f617e5f07fac8bd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Nov 2015 12:21:29 +0100 Subject: [PATCH 31/67] nvme: protect against simultaneous shutdown invocations Signed-off-by: Keith Busch [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d82f08d671e6..ad6d5cce42d5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -113,6 +114,7 @@ struct nvme_dev { struct work_struct reset_work; struct work_struct probe_work; struct work_struct scan_work; + struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; dma_addr_t cmb_dma_addr; @@ -2073,6 +2075,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_dev_list_remove(dev); + mutex_lock(&dev->shutdown_lock); if (dev->bar) { nvme_freeze_queues(dev); csts = readl(dev->bar + NVME_REG_CSTS); @@ -2091,6 +2094,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); + mutex_unlock(&dev->shutdown_lock); } static int nvme_setup_prp_pools(struct nvme_dev *dev) @@ -2333,6 +2337,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->probe_work, nvme_probe_work); INIT_WORK(&dev->reset_work, nvme_reset_work); + mutex_init(&dev->shutdown_lock); result = nvme_setup_prp_pools(dev); if (result) From 4c9f748f0ee88447b28546991f60f43a7319aafd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Oct 2015 14:03:34 +0200 Subject: [PATCH 32/67] nvme: don't take the I/O queue q_lock in nvme_timeout There is nothing it protects, but it makes lockdep unhappy in many 
different ways. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ad6d5cce42d5..d4fef8190093 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1094,13 +1094,13 @@ static void nvme_abort_req(struct request *req) struct nvme_command cmd; if (!nvmeq->qid || cmd_rq->aborted) { - spin_lock(&dev_list_lock); + spin_lock_irq(&dev_list_lock); if (!__nvme_reset(dev)) { dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); } - spin_unlock(&dev_list_lock); + spin_unlock_irq(&dev_list_lock); return; } @@ -1164,9 +1164,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, nvmeq->qid); - spin_lock_irq(&nvmeq->q_lock); nvme_abort_req(req); - spin_unlock_irq(&nvmeq->q_lock); /* * The aborted req will be completed on receiving the abort req. From 31c7c7d2c9f17dc98a98c59c17e184bf164ee760 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Oct 2015 14:03:35 +0200 Subject: [PATCH 33/67] nvme: merge nvme_abort_req and nvme_timeout We want to be able to return better error values from nvme_timeout, which is significantly easier if the two functions are merged. Also clean up and reduce the printk spew so that we only get one message per abort. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 47 ++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d4fef8190093..99c5b6319d8d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1078,13 +1078,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -/** - * nvme_abort_req - Attempt aborting a request - * - * Schedule controller reset if the command was already aborted once before and - * still hasn't been returned to the driver, or if this is the admin queue. - */ -static void nvme_abort_req(struct request *req) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = cmd_rq->nvmeq; @@ -1093,6 +1087,11 @@ static void nvme_abort_req(struct request *req) struct nvme_cmd_info *abort_cmd; struct nvme_command cmd; + /* + * Schedule controller reset if the command was already aborted once + * before and still hasn't been returned to the driver, or if this is + * the admin queue.
+ */ if (!nvmeq->qid || cmd_rq->aborted) { spin_lock_irq(&dev_list_lock); if (!__nvme_reset(dev)) { @@ -1101,16 +1100,16 @@ static void nvme_abort_req(struct request *req) req->tag, nvmeq->qid); } spin_unlock_irq(&dev_list_lock); - return; + return BLK_EH_RESET_TIMER; } if (!dev->ctrl.abort_limit) - return; + return BLK_EH_RESET_TIMER; abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) - return; + return BLK_EH_RESET_TIMER; abort_cmd = blk_mq_rq_to_pdu(abort_req); nvme_set_info(abort_cmd, abort_req, abort_completion); @@ -1124,9 +1123,16 @@ static void nvme_abort_req(struct request *req) --dev->ctrl.abort_limit; cmd_rq->aborted = 1; - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, - nvmeq->qid); + dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); nvme_submit_cmd(dev->queues[0], &cmd); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; } static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) @@ -1157,23 +1163,6 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved fn(nvmeq, ctx, &cqe); } -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) -{ - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd->nvmeq; - - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, - nvmeq->qid); - nvme_abort_req(req); - - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. - */ - return BLK_EH_RESET_TIMER; -} - static void nvme_free_queue(struct nvme_queue *nvmeq) { dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), From 297465c873ae8c99180617ca904dc1a4a738f25d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 12:58:11 +0100 Subject: [PATCH 34/67] nvme: add NVME_SC_CANCELLED To properly document how we are using a negative Linux error value to communicate request cancellations inside the driver. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/nvme.h | 10 ++++++++++ drivers/nvme/host/pci.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b75d41e5c378..88950f362d97 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,6 +19,16 @@ #include #include +enum { + /* + * Driver internal status code for commands that were cancelled due + * to timeouts or controller shutdown. The value is negative so + * that it a) doesn't overlap with the unsigned hardware error codes, + * and b) can easily be tested for. 
+ */ + NVME_SC_CANCELLED = -EINTR, +}; + extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 99c5b6319d8d..e683bd1a05e6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -621,7 +621,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, if (req->cmd_type == REQ_TYPE_DRV_PRIV) { if (cmd_rq->ctx == CMD_CTX_CANCELLED) - error = -EINTR; + error = NVME_SC_CANCELLED; else error = status; } else { From 846cc05f95d599801f296d8599e82686ebd395f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 12:10:29 +0100 Subject: [PATCH 35/67] nvme: simplify resets Don't delete the controller from dev_list before queuing a reset, instead just check for it being reset in the polling kthread. This allows us to remove the dev_list_lock in various places, and in addition we can simply rely on checking the queue_work return value to see if we could reset a controller. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e683bd1a05e6..febcef5ae0aa 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -77,7 +77,6 @@ struct nvme_dev; struct nvme_queue; struct nvme_iod; -static int __nvme_reset(struct nvme_dev *dev); static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod); @@ -1093,13 +1092,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * the admin queue. */ if (!nvmeq->qid || cmd_rq->aborted) { - spin_lock_irq(&dev_list_lock); - if (!__nvme_reset(dev)) { + if (queue_work(nvme_workq, &dev->reset_work)) { dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); } - spin_unlock_irq(&dev_list_lock); return BLK_EH_RESET_TIMER; } @@ -1496,9 +1493,15 @@ static int nvme_kthread(void *data) int i; u32 csts = readl(dev->bar + NVME_REG_CSTS); + /* + * Skip controllers currently under reset.
+ */ + if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work)) + continue; + if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || csts & NVME_CSTS_CFS) { - if (!__nvme_reset(dev)) { + if (queue_work(nvme_workq, &dev->reset_work)) { dev_warn(dev->dev, "Failed status: %x, reset controller\n", readl(dev->bar + NVME_REG_CSTS)); @@ -2228,33 +2231,17 @@ static void nvme_reset_work(struct work_struct *ws) schedule_work(&dev->probe_work); } -static int __nvme_reset(struct nvme_dev *dev) -{ - if (work_pending(&dev->reset_work)) - return -EBUSY; - list_del_init(&dev->node); - queue_work(nvme_workq, &dev->reset_work); - return 0; -} - static int nvme_reset(struct nvme_dev *dev) { - int ret; - if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) return -ENODEV; - spin_lock(&dev_list_lock); - ret = __nvme_reset(dev); - spin_unlock(&dev_list_lock); + if (!queue_work(nvme_workq, &dev->reset_work)) + return -EBUSY; - if (!ret) { - flush_work(&dev->reset_work); - flush_work(&dev->probe_work); - return 0; - } - - return ret; + flush_work(&dev->reset_work); + flush_work(&dev->probe_work); + return 0; } static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) From e1569a16180aef4311ff5fc54f54b23ae9e8a03e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Nov 2015 12:11:07 +0100 Subject: [PATCH 36/67] nvme: do not restart the request timeout if we're resetting the controller Otherwise we're never going to complete a command when it is restarted just after we completed all other outstanding commands in nvme_clear_queue. The controller must be disabled prior to completing a presumed lost command; do this by directly shutting down the controller before queueing the reset work, and returning EH_HANDLED from the timeout handler after we shut the controller down. Signed-off-by: Keith Busch [hch: split and rebase] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index febcef5ae0aa..6082f2775581 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -81,6 +81,7 @@ static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod); static void nvme_dead_ctrl(struct nvme_dev *dev); +static void nvme_dev_shutdown(struct nvme_dev *dev); struct async_cmd_info { struct kthread_work work; @@ -1087,17 +1088,23 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_command cmd; /* - * Schedule controller reset if the command was already aborted once - * before and still hasn't been returned to the driver, or if this is - * the admin queue. + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || cmd_rq->aborted) { dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); + nvme_dev_shutdown(dev); + queue_work(nvme_workq, &dev->reset_work); + + /* + * Mark the request as handled, since the inline shutdown + * forces all outstanding requests to complete.
+ */ + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; } if (!dev->ctrl.abort_limit) From fd634f4142861e533ac57e88ece8e98ab5851edb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 12:42:26 +0100 Subject: [PATCH 37/67] nvme: merge probe_work and reset_work If we're using two work queues we're always going to run into races where one item is tearing down what the other one is initializing. So instead merge the two work queues, and let the old probe_work also tear the controller down first if it was alive. Together with the better detection of the probe path using a flag this gives us a properly serialized reset/probe path that also doesn't accidentally trigger when two commands time out and the second one tries to reset the controller while the first reset is still in progress. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 74 ++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6082f2775581..23cbd93c0c56 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -112,7 +112,6 @@ struct nvme_dev { struct msix_entry *entry; void __iomem *bar; struct work_struct reset_work; - struct work_struct probe_work; struct work_struct scan_work; struct mutex shutdown_lock; bool subsystem; @@ -120,6 +119,8 @@ struct nvme_dev { dma_addr_t cmb_dma_addr; u64 cmb_size; u32 cmbsz; + unsigned long flags; +#define NVME_CTRL_RESETTING 0 struct nvme_ctrl ctrl; }; @@ -1088,9 +1089,24 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_command cmd; /* - * Shutdown the controller immediately and schedule a reset if the - * command was already aborted once before and still hasn't been - * returned to the driver, or if this is the admin queue. + * Shutdown immediately if controller times out while starting. The + * reset work will see the pci device disabled when it gets the forced + * cancellation error. All outstanding requests are completed on + * shutdown, so we return BLK_EH_HANDLED. + */ + if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, disable controller\n", + req->tag, nvmeq->qid); + nvme_dev_shutdown(dev); + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; + } + + /* + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || cmd_rq->aborted) { dev_warn(dev->dev, @@ -2131,11 +2147,23 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } -static void nvme_probe_work(struct work_struct *work) +static void nvme_reset_work(struct work_struct *work) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); + struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); int result; + if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) + goto out; + + /* + * If we're called to reset a live controller first shut it down before + * moving on.
+ */ + if (dev->bar) + nvme_dev_shutdown(dev); + + set_bit(NVME_CTRL_RESETTING, &dev->flags); + result = nvme_dev_map(dev); if (result) goto out; @@ -2175,6 +2203,7 @@ static void nvme_probe_work(struct work_struct *work) nvme_dev_add(dev); } + clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; remove: @@ -2189,7 +2218,7 @@ static void nvme_probe_work(struct work_struct *work) unmap: nvme_dev_unmap(dev); out: - if (!work_busy(&dev->reset_work)) + if (!work_pending(&dev->reset_work)) nvme_dead_ctrl(dev); } @@ -2216,28 +2245,6 @@ static void nvme_dead_ctrl(struct nvme_dev *dev) } } -static void nvme_reset_work(struct work_struct *ws) -{ - struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - bool in_probe = work_busy(&dev->probe_work); - - nvme_dev_shutdown(dev); - - /* Synchronize with device probe so that work will see failure status - * and exit gracefully without trying to schedule another reset */ - flush_work(&dev->probe_work); - - /* Fail this device if reset occured during probe to avoid - * infinite initialization loops. */ - if (in_probe) { - nvme_dead_ctrl(dev); - return; - } - /* Schedule device resume asynchronously so the reset work is available - * to cleanup errors that may occur during reinitialization */ - schedule_work(&dev->probe_work); -} - static int nvme_reset(struct nvme_dev *dev) { if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) @@ -2247,7 +2254,6 @@ static int nvme_reset(struct nvme_dev *dev) return -EBUSY; flush_work(&dev->reset_work); - flush_work(&dev->probe_work); return 0; } @@ -2316,7 +2322,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->scan_work, nvme_dev_scan); - INIT_WORK(&dev->probe_work, nvme_probe_work); INIT_WORK(&dev->reset_work, nvme_reset_work); mutex_init(&dev->shutdown_lock); @@ -2329,7 +2334,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; - schedule_work(&dev->probe_work); + schedule_work(&dev->reset_work); return 0; release_pools: @@ -2350,7 +2355,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) if (prepare) nvme_dev_shutdown(dev); else - schedule_work(&dev->probe_work); + schedule_work(&dev->reset_work); } static void nvme_shutdown(struct pci_dev *pdev) @@ -2368,7 +2373,6 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); - flush_work(&dev->probe_work); flush_work(&dev->reset_work); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); @@ -2402,7 +2406,7 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - schedule_work(&ndev->probe_work); + schedule_work(&ndev->reset_work); return 0; } #endif From 5c8809e650772be87ba04595a8ccf278bab7b543 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 12:35:49 +0100 Subject: [PATCH 38/67] nvme: remove dead controllers from a work item Compared to the kthread this gives us multiple call prevention for free. 
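For illustration, the multiple call prevention falls out of the workqueue API itself: schedule_work() returns false if the work item is already pending, so a racing caller simply drops the reference it took. A minimal sketch of the idiom, with made-up names (struct foo, foo_release), not the driver code itself:

	/* Sketch only: queue a teardown at most once. */
	static void foo_schedule_teardown(struct foo *f)
	{
		kref_get(&f->ref);		/* reference handed to the work item */
		if (!schedule_work(&f->teardown_work))
			kref_put(&f->ref, foo_release);	/* already queued: drop ours */
	}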
Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 23cbd93c0c56..26e982359a74 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -80,7 +80,7 @@ struct nvme_iod; static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod); -static void nvme_dead_ctrl(struct nvme_dev *dev); +static void nvme_remove_dead_ctrl(struct nvme_dev *dev); static void nvme_dev_shutdown(struct nvme_dev *dev); struct async_cmd_info { @@ -113,6 +113,7 @@ struct nvme_dev { void __iomem *bar; struct work_struct reset_work; struct work_struct scan_work; + struct work_struct remove_work; struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; @@ -2218,31 +2219,25 @@ static void nvme_reset_work(struct work_struct *work) unmap: nvme_dev_unmap(dev); out: - if (!work_pending(&dev->reset_work)) - nvme_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev); } -static int nvme_remove_dead_ctrl(void *arg) +static void nvme_remove_dead_ctrl_work(struct work_struct *work) { - struct nvme_dev *dev = (struct nvme_dev *)arg; + struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); nvme_put_ctrl(&dev->ctrl); - return 0; } -static void nvme_dead_ctrl(struct nvme_dev *dev) +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { - dev_warn(dev->dev, "Device failed to resume\n"); + dev_warn(dev->dev, "Removing after probe failure\n"); kref_get(&dev->ctrl.kref); - if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->ctrl.instance))) { - dev_err(dev->dev, - "Failed to start controller remove task\n"); + if (!schedule_work(&dev->remove_work)) nvme_put_ctrl(&dev->ctrl); - } } static int nvme_reset(struct nvme_dev *dev) @@ -2323,6 +2318,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); result = nvme_setup_prp_pools(dev); From 6bf25d16410d8d95e3552f31c6a99e3fc3d31752 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Nov 2015 09:36:44 +0100 Subject: [PATCH 39/67] nvme: switch abort_limit to an atomic_t There is no lock to synchronize access to the abort_limit field of struct nvme_ctrl, so switch it to an atomic_t.
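As a hedged sketch of the lock-free rationing this enables (the helper name is invented here; the real checks live inline in nvme_timeout): a slot is claimed by decrementing, and the decrement is undone on any path that does not actually send an abort, so the counter never drifts:

	static bool nvme_claim_abort_slot(atomic_t *abort_limit)
	{
		if (atomic_dec_return(abort_limit) < 0) {
			atomic_inc(abort_limit);	/* budget exhausted, restore */
			return false;
		}
		return true;			/* caller owns one abort slot */
	}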
Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 9 +++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 64891ebc4c52..859b1896d337 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -839,7 +839,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } ctrl->oncs = le16_to_cpup(&id->oncs); - ctrl->abort_limit = id->acl + 1; + atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; memcpy(ctrl->serial, id->sn, sizeof(id->sn)); memcpy(ctrl->model, id->mn, sizeof(id->mn)); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 88950f362d97..982fa303cec6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -77,7 +77,7 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 stripe_size; u16 oncs; - u16 abort_limit; + atomic_t abort_limit; u8 event_limit; u8 vwc; u32 vs; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 26e982359a74..1ad7f18d48e2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -388,7 +388,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx, blk_mq_free_request(req); dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); - ++nvmeq->dev->ctrl.abort_limit; + atomic_inc(&nvmeq->dev->ctrl.abort_limit); } static void async_completion(struct nvme_queue *nvmeq, void *ctx, @@ -1124,13 +1124,15 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_HANDLED; } - if (!dev->ctrl.abort_limit) + if (atomic_dec_and_test(&dev->ctrl.abort_limit)) return BLK_EH_RESET_TIMER; abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, BLK_MQ_REQ_NOWAIT); - if (IS_ERR(abort_req)) + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; + } abort_cmd = blk_mq_rq_to_pdu(abort_req); nvme_set_info(abort_cmd, abort_req, abort_completion); @@ -1141,7 +1143,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) cmd.abort.sqid = cpu_to_le16(nvmeq->qid); cmd.abort.command_id = abort_req->tag; - --dev->ctrl.abort_limit; cmd_rq->aborted = 1; dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", From 540c801c65eb58e05e0ca38b6fd644a83d7e2b33 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 22 Oct 2015 15:45:06 -0600 Subject: [PATCH 40/67] NVMe: Implement namespace list scanning The NVMe 1.1 specification provides an identify mode to return a list of active namespaces. This is a more efficient way to discover which namespace identifiers are active on a controller, providing a potentially significant improvement in scan time for controllers with sparsely populated namespaces. Signed-off-by: Keith Busch [hch: add quirk for the broken Qemu Identify implementation.
To be relaxed later] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 80 +++++++++++++++++++++++++++++++++++----- drivers/nvme/host/nvme.h | 6 +++ drivers/nvme/host/pci.c | 2 + 3 files changed, 79 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 859b1896d337..96e05327ecf6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -256,6 +256,16 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) +{ + struct nvme_command c = { }; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(2); + c.identify.nsid = cpu_to_le32(nsid); + return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); +} + int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id) { @@ -1071,33 +1081,85 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_put_ns(ns); } +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + ns = nvme_find_ns(ctrl, nsid); + if (ns) { + if (revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } else + nvme_alloc_ns(ctrl, nsid); +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns; + __le32 *ns_list; + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + int ret = 0; + + ns_list = kzalloc(0x1000, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (i = 0; i < num_lists; i++) { + ret = nvme_identify_ns_list(ctrl, prev, ns_list); + if (ret) + goto out; + + for (j = 0; j < min(nn, 1024U); j++) { + nsid = le32_to_cpu(ns_list[j]); + if (!nsid) + goto out; + + nvme_validate_ns(ctrl, nsid); + + while (++prev < nsid) { + ns = nvme_find_ns(ctrl, prev); + if (ns) + nvme_ns_remove(ns); + } + } + nn -= j; + } + out: + kfree(ns_list); + return ret; +} + static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) { struct nvme_ns *ns, *next; unsigned i; - for (i = 1; i <= nn; i++) { - ns = nvme_find_ns(ctrl, i); - if (ns) { - if (revalidate_disk(ns->disk)) - nvme_ns_remove(ns); - } else - nvme_alloc_ns(ctrl, i); - } + for (i = 1; i <= nn; i++) + nvme_validate_ns(ctrl, i); + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { if (ns->ns_id > nn) nvme_ns_remove(ns); } - list_sort(NULL, &ctrl->namespaces, ns_cmp); } void nvme_scan_namespaces(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; + unsigned nn; if (nvme_identify_ctrl(ctrl, &id)) return; + + nn = le32_to_cpu(id->nn); + if (ctrl->vs >= NVME_VS(1, 1) && + !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + if (!nvme_scan_ns_list(ctrl, nn)) + goto done; + } __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn)); + done: + list_sort(NULL, &ctrl->namespaces, ns_cmp); kfree(id); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 982fa303cec6..2965c469da4a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -53,6 +53,12 @@ enum nvme_quirks { * specific Identify field. */ NVME_QUIRK_STRIPE_SIZE = (1 << 0), + + /* + * The controller doesn't handle Identify value others than 0 or 1 + * correctly. 
+ */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), }; struct nvme_ctrl { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1ad7f18d48e2..fac1de847753 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2425,6 +2425,8 @@ static const struct pci_error_handlers nvme_err_handler = { static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0953), .driver_data = NVME_QUIRK_STRIPE_SIZE, }, + { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } From 92f7a1624bbc2361b96db81de89aee1baae40da9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 23 Oct 2015 11:42:02 -0600 Subject: [PATCH 41/67] NVMe: Use unbounded work queue for all work Removes all usage of the global work queue so work can't be scheduled on two different work queues, and removes nvme's work queue singlethreadedness so controllers can be driven in parallel. Signed-off-by: Keith Busch [hch: keep the dead controller removal on the system workqueue to avoid deadlocks] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fac1de847753..a909a8ba228a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -371,7 +371,7 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(nvmeq->q_dmadev, "rescanning\n"); - schedule_work(&nvmeq->dev->scan_work); + queue_work(nvme_workq, &nvmeq->dev->scan_work); default: dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); } @@ -1782,7 +1782,7 @@ static int nvme_dev_add(struct nvme_dev *dev) return 0; dev->ctrl.tagset = &dev->tagset; } - schedule_work(&dev->scan_work); + queue_work(nvme_workq, &dev->scan_work); return 0; } @@ -2331,7 +2331,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; - schedule_work(&dev->reset_work); + queue_work(nvme_workq, &dev->reset_work); return 0; release_pools: @@ -2352,7 +2352,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) if (prepare) nvme_dev_shutdown(dev); else - schedule_work(&dev->reset_work); + queue_work(nvme_workq, &dev->reset_work); } static void nvme_shutdown(struct pci_dev *pdev) @@ -2403,7 +2403,7 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - schedule_work(&ndev->reset_work); + queue_work(nvme_workq, &ndev->reset_work); return 0; } #endif @@ -2451,7 +2451,7 @@ static int __init nvme_init(void) init_waitqueue_head(&nvme_kthread_wait); - nvme_workq = create_singlethread_workqueue("nvme"); + nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!nvme_workq) return -ENOMEM; From 53029b0441bbd263dbb2ee6429572b1732dad4de Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Sat, 28 Nov 2015 15:41:02 +0100 Subject: [PATCH 42/67] NVMe: Remove device management handles on remove We don't want to allow new references to open on a device that is removed. This ties the lifetime of these handles to the physical device's presence rather than to the open reference count. 
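Roughly, this makes teardown two-stage: unpublish the user-visible handle as soon as the physical device goes away, and free the structure only on the last reference drop. A sketch under invented names (widget_*), not the driver code itself:

	/* From the bus ->remove() path: prevents any new opens. */
	static void widget_unpublish(struct widget *w)
	{
		device_destroy(widget_class, w->devt);
		spin_lock(&widget_list_lock);
		list_del(&w->node);
		spin_unlock(&widget_list_lock);
	}

	/* Runs only after the last open handle puts its reference. */
	static void widget_release(struct kref *kref)
	{
		struct widget *w = container_of(kref, struct widget, ref);

		kfree(w);
	}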
Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 13 +++++++++---- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 96e05327ecf6..25cb1929e985 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1200,17 +1200,22 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) spin_unlock(&dev_list_lock); } -static void nvme_free_ctrl(struct kref *kref) -{ - struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) + { + device_remove_file(ctrl->device, &dev_attr_reset_controller); + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); list_del(&ctrl->node); spin_unlock(&dev_list_lock); +} + +static void nvme_free_ctrl(struct kref *kref) +{ + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); put_device(ctrl->device); nvme_release_instance(ctrl); - device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); ctrl->ops->free_ctrl(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2965c469da4a..aa4b42ecbebe 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -221,6 +221,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a909a8ba228a..c83f0d8a592b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2373,6 +2373,7 @@ static void nvme_remove(struct pci_dev *pdev) flush_work(&dev->reset_work); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); + nvme_uninit_ctrl(&dev->ctrl); nvme_dev_shutdown(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); From 4b9d5b151046ff717819864f93cb8e012b347bce Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 20 Nov 2015 09:13:30 +0100 Subject: [PATCH 43/67] NVMe: Simplify metadata setup We no longer require the two-pass setup for block integrity. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 25cb1929e985..875e403830a9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -576,7 +576,6 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (ns->lba_shift == 0) ns->lba_shift = 9; bs = 1 << ns->lba_shift; - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 
id->dps & NVME_NS_DPS_PI_MASK : 0; @@ -591,9 +590,8 @@ static int nvme_revalidate_disk(struct gendisk *disk) ns->pi_type = pi_type; blk_queue_logical_block_size(ns->queue, bs); - if (ns->ms && !ns->ext) + if (ns->ms && !blk_get_integrity(disk) && !ns->ext) nvme_init_integrity(ns); - if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); else @@ -1002,7 +1000,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->ns_id = nsid; ns->disk = disk; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - list_add_tail(&ns->list, &ctrl->namespaces); blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (ctrl->max_hw_sectors) { @@ -1025,36 +1022,17 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); - /* - * Initialize capacity to 0 until we establish the namespace format and - * setup integrity extentions if necessary. The revalidate_disk after - * add_disk allows the driver to register with integrity if the format - * requires it. - */ - set_capacity(disk, 0); if (nvme_revalidate_disk(ns->disk)) goto out_free_disk; + list_add_tail(&ns->list, &ctrl->namespaces); kref_get(&ctrl->kref); - if (ns->type != NVME_NS_LIGHTNVM) { + if (ns->type != NVME_NS_LIGHTNVM) add_disk(ns->disk); - if (ns->ms) { - struct block_device *bd = bdget_disk(ns->disk, 0); - if (!bd) - return; - if (blkdev_get(bd, FMODE_READ, NULL)) { - bdput(bd); - return; - } - blkdev_reread_part(bd); - blkdev_put(bd, FMODE_READ); - } - } return; out_free_disk: kfree(disk); - list_del(&ns->list); out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: From 4680072003df14230e9eeeeefb617401012234a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2015 12:40:02 +0100 Subject: [PATCH 44/67] nvme: fix admin queue depth The number in tag_set->queue depth includes the reserved tags. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c83f0d8a592b..686a4e230841 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1431,7 +1431,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (!dev->ctrl.admin_q) { dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; + dev->admin_tagset.queue_depth = NVME_AQ_DEPTH; dev->admin_tagset.reserved_tags = 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); From 7688faa6dd2c99ce5d66571d9ad65535ec39e8cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 28 Nov 2015 15:41:58 +0100 Subject: [PATCH 45/67] nvme: factor out a few helpers from req_completion We'll need them in other places later. 
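A hypothetical consumer, to show the intent (example only, not part of this patch): with the helpers factored out, any completion path can reuse the same retry decision instead of open-coding the DNR and timeout checks:

	static void example_complete(struct request *req, u16 status)
	{
		if (status && nvme_req_needs_retry(req, status)) {
			nvme_requeue_req(req);	/* back on the requeue list */
			return;
		}
		blk_mq_end_request(req, status ? nvme_error_status(status) : 0);
	}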
Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 11 +++++++++++ drivers/nvme/host/nvme.h | 7 +++++++ drivers/nvme/host/pci.c | 12 ++---------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 875e403830a9..b52a789e1e77 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -78,6 +78,17 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) return ns; } +void nvme_requeue_req(struct request *req) +{ + unsigned long flags; + + blk_mq_requeue_request(req); + spin_lock_irqsave(req->q->queue_lock, flags); + if (!blk_queue_stopped(req->q)) + blk_mq_kick_requeue_list(req->q); + spin_unlock_irqrestore(req->q->queue_lock, flags); +} + struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index aa4b42ecbebe..b0417622d27c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -216,6 +216,12 @@ static inline int nvme_error_status(u16 status) } } +static inline bool nvme_req_needs_retry(struct request *req, u16 status) +{ + return !(status & NVME_SC_DNR || blk_noretry_request(req)) && + (jiffies - req->start_time) < req->timeout; +} + int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); @@ -230,6 +236,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); +void nvme_requeue_req(struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 686a4e230841..808fb7355603 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -607,17 +607,9 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, int error = 0; if (unlikely(status)) { - if (!(status & NVME_SC_DNR || blk_noretry_request(req)) - && (jiffies - req->start_time) < req->timeout) { - unsigned long flags; - + if (nvme_req_needs_retry(req, status)) { nvme_unmap_data(nvmeq->dev, iod); - - blk_mq_requeue_request(req); - spin_lock_irqsave(req->q->queue_lock, flags); - if (!blk_queue_stopped(req->q)) - blk_mq_kick_requeue_list(req->q); - spin_unlock_irqrestore(req->q->queue_lock, flags); + nvme_requeue_req(req); return; } From d8f32166a9c587e87a3a86f654c73d40b6b5df00 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2015 10:28:47 +0100 Subject: [PATCH 46/67] nvme: switch delete SQ/CQ to blk_execute_rq_nowait Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 49 +++++++++++++---------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 808fb7355603..d6d92b022f97 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -86,8 +86,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev); struct async_cmd_info { struct kthread_work work; struct kthread_worker *worker; - struct request *req; - u32 result; int status; void *ctx; }; @@ -391,16 +389,6 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx, 
atomic_inc(&nvmeq->dev->ctrl.abort_limit); } -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_mq_free_request(cmdinfo->req); -} - static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, unsigned int tag) { @@ -985,28 +973,13 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) return 0; } -static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, - struct nvme_command *cmd, - struct async_cmd_info *cmdinfo, unsigned timeout) +static void async_cmd_info_endio(struct request *req, int error) { - struct nvme_queue *nvmeq = dev->queues[0]; - struct request *req; - struct nvme_cmd_info *cmd_rq; + struct async_cmd_info *cmdinfo = req->end_io_data; - req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout = timeout; - cmd_rq = blk_mq_rq_to_pdu(req); - cmdinfo->req = req; - nvme_set_info(cmd_rq, cmdinfo, async_completion); - cmdinfo->status = -EINTR; - - cmd->common.command_id = req->tag; - - nvme_submit_cmd(nvmeq, cmd); - return 0; + cmdinfo->status = req->errors; + queue_kthread_work(cmdinfo->worker, &cmdinfo->work); + blk_mq_free_request(req); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1920,6 +1893,7 @@ static void nvme_del_queue_end(struct nvme_queue *nvmeq) static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, kthread_work_func_t fn) { + struct request *req; struct nvme_command c; memset(&c, 0, sizeof(c)); @@ -1927,8 +1901,15 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, c.delete_queue.qid = cpu_to_le16(nvmeq->qid); init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, - ADMIN_TIMEOUT); + + req = nvme_alloc_request(nvmeq->dev->ctrl.admin_q, &c, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = ADMIN_TIMEOUT; + req->end_io_data = &nvmeq->cmdinfo; + blk_execute_rq_nowait(req->q, NULL, req, 0, async_cmd_info_endio); + return 0; } static void nvme_del_cq_work_handler(struct kthread_work *work) From e7a2a87d5938bbebe1637c82fbde94ea6be3ef78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2015 10:39:48 +0100 Subject: [PATCH 47/67] nvme: switch abort to blk_execute_rq_nowait And remove the now unused nvme_submit_cmd helper. 
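The shape of the conversion, sketched with invented names (the real hunks follow below): allocate a request that carries the NVMe command, hand it to blk_execute_rq_nowait(), and do the accounting that the old completion handler did in the end_io callback:

	static void example_endio(struct request *req, int error)
	{
		/* req->errors holds the command status at this point */
		blk_mq_free_request(req);
	}

	static int example_submit_nowait(struct request_queue *q,
			struct nvme_command *cmd)
	{
		struct request *req;

		req = nvme_alloc_request(q, cmd, BLK_MQ_REQ_NOWAIT);
		if (IS_ERR(req))
			return PTR_ERR(req);

		req->timeout = ADMIN_TIMEOUT;
		req->end_io_data = NULL;
		blk_execute_rq_nowait(req->q, NULL, req, 0, example_endio);
		return 0;
	}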
Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 71 ++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d6d92b022f97..6a32a92a9227 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -375,20 +375,6 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, } } -static void abort_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct request *req = ctx; - - u16 status = le16_to_cpup(&cqe->status) >> 1; - u32 result = le32_to_cpup(&cqe->result); - - blk_mq_free_request(req); - - dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); - atomic_inc(&nvmeq->dev->ctrl.abort_limit); -} - static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, unsigned int tag) { @@ -440,14 +426,6 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - unsigned long flags; - spin_lock_irqsave(&nvmeq->q_lock, flags); - __nvme_submit_cmd(nvmeq, cmd); - spin_unlock_irqrestore(&nvmeq->q_lock, flags); -} - static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; @@ -1045,13 +1023,25 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } +static void abort_endio(struct request *req, int error) +{ + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd->nvmeq; + u32 result = (u32)(uintptr_t)req->special; + u16 status = req->errors; + + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + atomic_inc(&nvmeq->dev->ctrl.abort_limit); + + blk_mq_free_request(req); +} + static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = cmd_rq->nvmeq; struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; - struct nvme_cmd_info *abort_cmd; struct nvme_command cmd; /* @@ -1089,30 +1079,31 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_HANDLED; } - if (atomic_dec_and_test(&dev->ctrl.abort_limit)) - return BLK_EH_RESET_TIMER; + cmd_rq->aborted = 1; - abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.abort.opcode = nvme_admin_abort_cmd; + cmd.abort.cid = req->tag; + cmd.abort.sqid = cpu_to_le16(nvmeq->qid); + + dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); + + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } - abort_cmd = blk_mq_rq_to_pdu(abort_req); - nvme_set_info(abort_cmd, abort_req, abort_completion); - - memset(&cmd, 0, sizeof(cmd)); - cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = req->tag; - cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = abort_req->tag; - - cmd_rq->aborted = 1; - - dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", - req->tag, nvmeq->qid); - nvme_submit_cmd(dev->queues[0], &cmd); + abort_req->timeout = ADMIN_TIMEOUT; + abort_req->end_io_data = NULL; + 
blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); /* * The aborted req will be completed on receiving the abort req. From adf68f21c15572c68d9fadae618a09cf324b9814 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 28 Nov 2015 15:42:28 +0100 Subject: [PATCH 48/67] nvme: special case AEN requests AEN requests are different from other requests in that they don't time out or can easily be cancelled. Because of that we should not use the blk-mq infrastructure but just special case them in the completion path. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 75 ++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6a32a92a9227..0497ff67324c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -48,6 +48,13 @@ #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); @@ -355,23 +362,23 @@ static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) return ctx; } -static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) +static void nvme_complete_async_event(struct nvme_dev *dev, + struct nvme_completion *cqe) { - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; + u16 status = le16_to_cpu(cqe->status) >> 1; + u32 result = le32_to_cpu(cqe->result); if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->ctrl.event_limit; + ++dev->ctrl.event_limit; if (status != NVME_SC_SUCCESS) return; switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: - dev_info(nvmeq->q_dmadev, "rescanning\n"); - queue_work(nvme_workq, &nvmeq->dev->scan_work); + dev_info(dev->dev, "rescanning\n"); + queue_work(nvme_workq, &dev->scan_work); default: - dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); + dev_warn(dev->dev, "async event result %08x\n", result); } } @@ -404,7 +411,7 @@ static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, } /** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send * @@ -853,15 +860,31 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) void *ctx; nvme_completion_fn fn; struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != phase) + u16 status = le16_to_cpu(cqe.status); + + if ((status & 1) != phase) break; nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { head = 0; phase = !phase; } + if (tag && *tag == cqe.command_id) *tag = -1; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. 
+ */ + if (unlikely(nvmeq->qid == 0 && + cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(nvmeq->dev, &cqe); + continue; + } + ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); fn(nvmeq, ctx, &cqe); } @@ -926,29 +949,15 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -static int nvme_submit_async_admin_req(struct nvme_dev *dev) +static void nvme_submit_async_event(struct nvme_dev *dev) { - struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; - struct nvme_cmd_info *cmd_info; - struct request *req; - - req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, - BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_flags |= REQ_NO_TIMEOUT; - cmd_info = blk_mq_rq_to_pdu(req); - nvme_set_info(cmd_info, NULL, async_req_completion); memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = req->tag; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit; - blk_mq_free_request(req); - __nvme_submit_cmd(nvmeq, &c); - return 0; + __nvme_submit_cmd(dev->queues[0], &c); } static void async_cmd_info_endio(struct request *req, int error) @@ -1387,8 +1396,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (!dev->ctrl.admin_q) { dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - dev->admin_tagset.queue_depth = NVME_AQ_DEPTH; - dev->admin_tagset.reserved_tags = 1; + dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); @@ -1496,11 +1504,8 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - while (i == 0 && dev->ctrl.event_limit > 0) { - if (nvme_submit_async_admin_req(dev)) - break; - dev->ctrl.event_limit--; - } + while (i == 0 && dev->ctrl.event_limit > 0) + nvme_submit_async_event(dev); spin_unlock_irq(&nvmeq->q_lock); } } @@ -2151,7 +2156,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto free_tags; - dev->ctrl.event_limit = 1; + dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; result = nvme_dev_list_add(dev); if (result) From aae239e1910ebc27ec9f7e8b25904a69626cf28c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 12:59:50 +0100 Subject: [PATCH 49/67] nvme: simplify completion handling Now that all commands are executed as block layer requests we can remove the internal completion in the NVMe driver. Note that we can simply call blk_mq_complete_request to abort commands as the block layer will protect against double completions internally.
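For context, the loop being simplified walks the completion queue by phase bit; a reduced sketch (names shortened, request lookup and doorbell bookkeeping elided) of how entries are claimed before being completed through the block layer:

	/*
	 * Bit 0 of the CQE status flips each time the queue wraps, so an
	 * entry is valid only while its phase bit matches the expected one.
	 */
	static void example_process_cq(struct nvme_completion *cqes, u16 depth,
			u16 *headp, u8 *phasep)
	{
		u16 head = *headp;
		u8 phase = *phasep;

		while ((le16_to_cpu(cqes[head].status) & 1) == phase) {
			struct nvme_completion cqe = cqes[head];

			if (++head == depth) {
				head = 0;
				phase = !phase;	/* wrapped: expect inverted bit */
			}
			/* look up the request by cqe.command_id, complete it */
		}
		*headp = head;
		*phasep = phase;
	}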
Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 141 ++++++++-------------------------------- 1 file changed, 26 insertions(+), 115 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0497ff67324c..84ac46fc9873 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -199,15 +199,11 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } -typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, - struct nvme_completion *); - struct nvme_cmd_info { - nvme_completion_fn fn; - void *ctx; int aborted; struct nvme_queue *nvmeq; - struct nvme_iod iod[0]; + struct nvme_iod *iod; + struct nvme_iod __iod; }; /* @@ -302,15 +298,6 @@ static int nvme_init_request(void *data, struct request *req, return 0; } -static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, - nvme_completion_fn handler) -{ - cmd->fn = handler; - cmd->ctx = ctx; - cmd->aborted = 0; - blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); -} - static void *iod_get_private(struct nvme_iod *iod) { return (void *) (iod->private & ~0x1UL); @@ -324,44 +311,6 @@ static bool iod_should_kfree(struct nvme_iod *iod) return (iod->private & NVME_INT_MASK) == 0; } -/* Special values must be less than 0x1000 */ -#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) -#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) - -static void special_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - if (ctx == CMD_CTX_CANCELLED) - return; - if (ctx == CMD_CTX_COMPLETED) { - dev_warn(nvmeq->q_dmadev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - if (ctx == CMD_CTX_INVALID) { - dev_warn(nvmeq->q_dmadev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); -} - -static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) -{ - void *ctx; - - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_CANCELLED; - return ctx; -} - static void nvme_complete_async_event(struct nvme_dev *dev, struct nvme_completion *cqe) { @@ -382,34 +331,6 @@ static void nvme_complete_async_event(struct nvme_dev *dev, } } -static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, - unsigned int tag) -{ - struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); - - return blk_mq_rq_to_pdu(req); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
- */ -static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, - nvme_completion_fn *fn) -{ - struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); - void *ctx; - if (tag >= nvmeq->q_depth) { - *fn = special_completion; - return CMD_CTX_INVALID; - } - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_COMPLETED; - return ctx; -} - /** * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use @@ -473,7 +394,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, size <= NVME_INT_BYTES(dev)) { struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); - iod = cmd->iod; + iod = &cmd->__iod; iod_init(iod, size, rq->nr_phys_segments, (unsigned long) rq | NVME_INT_MASK); return iod; @@ -570,12 +491,11 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif -static void req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) +static void req_completion(struct nvme_queue *nvmeq, struct nvme_completion *cqe) { - struct nvme_iod *iod = ctx; - struct request *req = iod_get_private(iod); + struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = cmd_rq->iod; u16 status = le16_to_cpup(&cqe->status) >> 1; int error = 0; @@ -586,14 +506,10 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, return; } - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - if (cmd_rq->ctx == CMD_CTX_CANCELLED) - error = NVME_SC_CANCELLED; - else - error = status; - } else { + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = status; + else error = nvme_error_status(status); - } } if (req->cmd_type == REQ_TYPE_DRV_PRIV) { @@ -836,8 +752,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) goto out; + cmd->iod = iod; + cmd->aborted = 0; cmnd.common.command_id = req->tag; - nvme_set_info(cmd, iod, req_completion); + blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &cmnd); @@ -857,8 +775,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) phase = nvmeq->cq_phase; for (;;) { - void *ctx; - nvme_completion_fn fn; struct nvme_completion cqe = nvmeq->cqes[head]; u16 status = le16_to_cpu(cqe.status); @@ -873,6 +789,13 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) if (tag && *tag == cqe.command_id) *tag = -1; + if (unlikely(cqe.command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe.command_id, le16_to_cpu(cqe.sq_id)); + continue; + } + /* * AEN requests are special as they don't time out and can * survive any kind of queue freeze and often don't respond to @@ -885,8 +808,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) continue; } - ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); - fn(nvmeq, ctx, &cqe); + req_completion(nvmeq, &cqe); } /* If the controller ignores the cq head doorbell and continuously @@ -1125,29 +1047,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) { struct nvme_queue *nvmeq = data; - void *ctx; - nvme_completion_fn fn; - struct nvme_cmd_info *cmd; - struct nvme_completion cqe; + int status; if (!blk_mq_request_started(req)) return; - cmd = blk_mq_rq_to_pdu(req); - - if (cmd->ctx == CMD_CTX_CANCELLED) - return; + 
dev_warn(nvmeq->q_dmadev, + "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); + status = NVME_SC_CANCELLED; if (blk_queue_dying(req->q)) - cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); - else - cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - - - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", - req->tag, nvmeq->qid); - ctx = cancel_cmd_info(cmd, &fn); - fn(nvmeq, ctx, &cqe); + status |= NVME_SC_DNR; + blk_mq_complete_request(req, status); } static void nvme_free_queue(struct nvme_queue *nvmeq) From eee417b0697827a6e120199b126b447af3c81b47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2015 13:03:13 +0100 Subject: [PATCH 50/67] nvme: properly free resources for cancelled command We need to move freeing of resources to the ->complete handler to ensure they are also freed when we cancel the command. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 79 +++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 84ac46fc9873..ec768b64ab77 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -82,11 +82,9 @@ static wait_queue_head_t nvme_kthread_wait; struct nvme_dev; struct nvme_queue; -struct nvme_iod; static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod); static void nvme_remove_dead_ctrl(struct nvme_dev *dev); static void nvme_dev_shutdown(struct nvme_dev *dev); @@ -491,41 +489,6 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif -static void req_completion(struct nvme_queue *nvmeq, struct nvme_completion *cqe) -{ - struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_iod *iod = cmd_rq->iod; - u16 status = le16_to_cpup(&cqe->status) >> 1; - int error = 0; - - if (unlikely(status)) { - if (nvme_req_needs_retry(req, status)) { - nvme_unmap_data(nvmeq->dev, iod); - nvme_requeue_req(req); - return; - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - error = status; - else - error = nvme_error_status(status); - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - u32 result = le32_to_cpup(&cqe->result); - req->special = (void *)(uintptr_t)result; - } - - if (cmd_rq->aborted) - dev_warn(nvmeq->dev->dev, - "completing aborted command with status:%04x\n", - error); - - nvme_unmap_data(nvmeq->dev, iod); - blk_mq_complete_request(req, error); -} - static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len) { @@ -726,7 +689,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && req->cmd_type != REQ_TYPE_DRV_PRIV) { - blk_mq_complete_request(req, -EFAULT); + blk_mq_end_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } } @@ -767,6 +730,35 @@ out: return ret; } +static void nvme_complete_rq(struct request *req) +{ + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = cmd->nvmeq->dev; + int error = 0; + + nvme_unmap_data(dev, cmd->iod); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; + } + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } + + if (unlikely(cmd->aborted)) { + 
dev_warn(dev->dev, + "completing aborted command with status: %04x\n", + req->errors); + } + + blk_mq_end_request(req, error); +} + static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) { u16 head, phase; @@ -777,6 +769,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) for (;;) { struct nvme_completion cqe = nvmeq->cqes[head]; u16 status = le16_to_cpu(cqe.status); + struct request *req; if ((status & 1) != phase) break; @@ -808,7 +801,13 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) continue; } - req_completion(nvmeq, &cqe); + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + u32 result = le32_to_cpu(cqe.result); + req->special = (void *)(uintptr_t)result; + } + blk_mq_complete_request(req, status >> 1); + } /* If the controller ignores the cq head doorbell and continuously @@ -1278,6 +1277,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, @@ -1287,6 +1287,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = { static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, From bf68405705bd35c09ec1f7528718dce5af88daff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Oct 2015 17:12:51 +0900 Subject: [PATCH 51/67] nvme: meta_sg doesn't have to be an array Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ec768b64ab77..24d695a2f6c4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -174,7 +174,7 @@ struct nvme_iod { int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; - struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ + struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ struct scatterlist sg[0]; }; @@ -594,21 +594,21 @@ static int nvme_map_data(struct nvme_dev *dev, struct nvme_iod *iod, if (blk_rq_count_integrity_sg(q, req->bio) != 1) goto out_unmap; - sg_init_table(iod->meta_sg, 1); - if (blk_rq_map_integrity_sg(q, req->bio, iod->meta_sg) != 1) + sg_init_table(&iod->meta_sg, 1); + if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) goto out_unmap; if (rq_data_dir(req)) nvme_dif_remap(req, nvme_dif_prep); - if (!dma_map_sg(dev->dev, iod->meta_sg, 1, dma_dir)) + if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) goto out_unmap; } cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) - cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); return BLK_MQ_RQ_QUEUE_OK; out_unmap: @@ -628,7 +628,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod) if (blk_integrity_rq(req)) { if (!rq_data_dir(req)) nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(dev->dev, iod->meta_sg, 1, dma_dir); + dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); } } From f4800d6d1548e0d5ab94f2216d41d94282e2588c Mon Sep 17 00:00:00 2001 From: 
Christoph Hellwig Date: Sat, 28 Nov 2015 15:43:10 +0100 Subject: [PATCH 52/67] nvme: merge iod and cmd_info Merge the two per-request structures in the nvme driver into a single one. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 184 ++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 111 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 24d695a2f6c4..b88708affad8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -164,18 +164,19 @@ struct nvme_queue { /* * The nvme_iod describes the data in an I/O, including the list of PRP * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space + * me express that. Use nvme_init_iod to ensure there's enough space * allocated to store the PRP list. */ struct nvme_iod { - unsigned long private; /* For the use of the submitter of the I/O */ + struct nvme_queue *nvmeq; + int aborted; int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ - struct scatterlist sg[0]; + struct scatterlist *sg; + struct scatterlist inline_sg[0]; }; /* @@ -197,19 +198,11 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } -struct nvme_cmd_info { - int aborted; - struct nvme_queue *nvmeq; - struct nvme_iod *iod; - struct nvme_iod __iod; -}; - /* * Max size of iod being embedded in the request payload */ #define NVME_INT_PAGES 2 #define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) -#define NVME_INT_MASK 0x01 /* * Will slightly overestimate the number of pages needed. 
This is OK @@ -223,15 +216,17 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } +static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg) +{ + return sizeof(__le64 *) * nvme_npages(size, dev) + + sizeof(struct scatterlist) * nseg; +} + static unsigned int nvme_cmd_size(struct nvme_dev *dev) { - unsigned int ret = sizeof(struct nvme_cmd_info); - - ret += sizeof(struct nvme_iod); - ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); - ret += sizeof(struct scatterlist) * NVME_INT_PAGES; - - return ret; + return sizeof(struct nvme_iod) + + nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -261,11 +256,11 @@ static int nvme_admin_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[0]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } @@ -288,27 +283,14 @@ static int nvme_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } -static void *iod_get_private(struct nvme_iod *iod) -{ - return (void *) (iod->private & ~0x1UL); -} - -/* - * If bit 0 is set, the iod is embedded in the request payload. - */ -static bool iod_should_kfree(struct nvme_iod *iod) -{ - return (iod->private & NVME_INT_MASK) == 0; -} - static void nvme_complete_async_event(struct nvme_dev *dev, struct nvme_completion *cqe) { @@ -352,61 +334,44 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static __le64 **iod_list(struct nvme_iod *iod) +static __le64 **iod_list(struct request *req) { - return ((void *)iod) + iod->offset; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + return (__le64 **)(iod->sg + req->nr_phys_segments); } -static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, - unsigned nseg, unsigned long private) +static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) { - iod->private = private; - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; -} + struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); + int nseg = rq->nr_phys_segments; + unsigned size; -static struct nvme_iod * -__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, - unsigned long priv, gfp_t gfp) -{ - struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(bytes, dev) + - sizeof(struct scatterlist) * nseg, gfp); + if (rq->cmd_flags & REQ_DISCARD) + size = sizeof(struct nvme_dsm_range); + else + size = blk_rq_bytes(rq); - if (iod) - iod_init(iod, bytes, nseg, priv); - - return iod; -} - -static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, - gfp_t gfp) -{ - unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : - sizeof(struct nvme_dsm_range); - struct nvme_iod *iod; - - if (rq->nr_phys_segments <= NVME_INT_PAGES && - size <= NVME_INT_BYTES(dev)) { - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); - - iod = &cmd->__iod; - iod_init(iod, size, rq->nr_phys_segments, - (unsigned long) rq | NVME_INT_MASK); - return iod; + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { + iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + if (!iod->sg) + return BLK_MQ_RQ_QUEUE_BUSY; + } else { + iod->sg = iod->inline_sg; } - return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, - (unsigned long) rq, gfp); + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + iod->length = size; + return 0; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +static void nvme_free_iod(struct nvme_dev *dev, struct request *req) { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); const int last_prp = dev->ctrl.page_size / 8 - 1; int i; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; if (iod->npages == 0) @@ -418,8 +383,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) prp_dma = next_prp_dma; } - if (iod_should_kfree(iod)) - kfree(iod); + if (iod->sg != iod->inline_sg) + kfree(iod->sg); } #ifdef CONFIG_BLK_DEV_INTEGRITY @@ -489,9 +454,10 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif -static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, +static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, int total_len) { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; int length = total_len; struct scatterlist *sg = iod->sg; @@ -500,7 +466,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma; int nprps, i; @@ -568,10 +534,10 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, return true; } -static int nvme_map_data(struct nvme_dev *dev, struct nvme_iod *iod, +static int nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { - struct request *req = iod_get_private(iod); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; @@ -586,7 +552,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct nvme_iod *iod, if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) goto out; - if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req))) + if (!nvme_setup_prps(dev, req, blk_rq_bytes(req))) goto out_unmap; ret = BLK_MQ_RQ_QUEUE_ERROR; @@ -617,9 +583,9 @@ out: return ret; } -static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod) +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { - struct request *req = iod_get_private(iod); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; @@ -632,7 +598,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod) } } - nvme_free_iod(dev, iod); + nvme_free_iod(dev, req); } /* @@ -641,16 +607,16 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod) * the iod. 
*/ static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct nvme_iod *iod, struct nvme_command *cmnd) + struct request *req, struct nvme_command *cmnd) { - struct request *req = iod_get_private(iod); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_dsm_range *range; range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); if (!range) return BLK_MQ_RQ_QUEUE_BUSY; - iod_list(iod)[0] = (__le64 *)range; + iod_list(req)[0] = (__le64 *)range; iod->npages = 0; range->cattr = cpu_to_le32(0); @@ -676,8 +642,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_queue *nvmeq = hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_iod *iod; struct nvme_command cmnd; int ret = BLK_MQ_RQ_QUEUE_OK; @@ -694,12 +658,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } } - iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); - if (!iod) - return BLK_MQ_RQ_QUEUE_BUSY; + ret = nvme_init_iod(req, dev); + if (ret) + return ret; if (req->cmd_flags & REQ_DISCARD) { - ret = nvme_setup_discard(nvmeq, ns, iod, &cmnd); + ret = nvme_setup_discard(nvmeq, ns, req, &cmnd); } else { if (req->cmd_type == REQ_TYPE_DRV_PRIV) memcpy(&cmnd, req->cmd, sizeof(cmnd)); @@ -709,14 +673,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_setup_rw(ns, req, &cmnd); if (req->nr_phys_segments) - ret = nvme_map_data(dev, iod, &cmnd); + ret = nvme_map_data(dev, req, &cmnd); } if (ret) goto out; - cmd->iod = iod; - cmd->aborted = 0; cmnd.common.command_id = req->tag; blk_mq_start_request(req); @@ -726,17 +688,17 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, spin_unlock_irq(&nvmeq->q_lock); return BLK_MQ_RQ_QUEUE_OK; out: - nvme_free_iod(dev, iod); + nvme_free_iod(dev, req); return ret; } static void nvme_complete_rq(struct request *req) { - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_dev *dev = cmd->nvmeq->dev; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = iod->nvmeq->dev; int error = 0; - nvme_unmap_data(dev, cmd->iod); + nvme_unmap_data(dev, req); if (unlikely(req->errors)) { if (nvme_req_needs_retry(req, req->errors)) { @@ -750,7 +712,7 @@ static void nvme_complete_rq(struct request *req) error = nvme_error_status(req->errors); } - if (unlikely(cmd->aborted)) { + if (unlikely(iod->aborted)) { dev_warn(dev->dev, "completing aborted command with status: %04x\n", req->errors); @@ -955,8 +917,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) static void abort_endio(struct request *req, int error) { - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd->nvmeq; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; u32 result = (u32)(uintptr_t)req->special; u16 status = req->errors; @@ -968,8 +930,8 @@ static void abort_endio(struct request *req, int error) static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; @@ -994,7 +956,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * command was already aborted once before and still hasn't been * returned to the driver, or if this is the admin 
queue. */ - if (!nvmeq->qid || cmd_rq->aborted) { + if (!nvmeq->qid || iod->aborted) { dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); @@ -1009,7 +971,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_HANDLED; } - cmd_rq->aborted = 1; + iod->aborted = 1; if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { atomic_inc(&dev->ctrl.abort_limit); From bbc758ec04c2f30805ce0fcdfbaa4c3445fafbae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 7 Nov 2015 09:39:28 +0100 Subject: [PATCH 53/67] block: remove REQ_NO_TIMEOUT flag This was added for the 'magic' AEN requests in the NVMe driver that never return. We now handle them purely inside the driver and don't need this core hack any more. Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 -- block/blk-timeout.c | 3 --- include/linux/blk_types.h | 2 -- 3 files changed, 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9cb2894840ab..8f812b486c07 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -603,8 +603,6 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, blk_mq_complete_request(rq, -EIO); return; } - if (rq->cmd_flags & REQ_NO_TIMEOUT) - return; if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index dd4fdfbcb3dd..a30441a200c0 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -197,9 +197,6 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; - if (req->cmd_flags & REQ_NO_TIMEOUT) - return; - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0fb65843ec1e..86a38ea1823f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,7 +188,6 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ - __REQ_NO_TIMEOUT, /* requests may never expire */ __REQ_NR_BITS, /* stops here */ }; @@ -242,7 +241,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -#define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U From a0a3408ee614848c27b0d36c2fe490da3b387b8d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 7 Dec 2015 15:30:31 -0700 Subject: [PATCH 54/67] NVMe: Add pci error handlers Requests enabling pcie aer support. Shuts down the controller on error detected with io frozen state prior to requesting slot reset; resumes controller after reset completes. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 54 +++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b88708affad8..b82bbea909cd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -12,6 +12,7 @@ * more details. 
*/ +#include #include #include #include @@ -1670,6 +1671,8 @@ static int nvme_dev_map(struct nvme_dev *dev) if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) dev->cmb = nvme_map_cmb(dev); + pci_enable_pcie_error_reporting(pdev); + pci_save_state(pdev); return 0; unmap: @@ -1697,8 +1700,10 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_regions(pdev); } - if (pci_is_enabled(pdev)) + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); + } } struct nvme_delq_ctx { @@ -2225,13 +2230,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_put_ctrl(&dev->ctrl); } -/* These functions are yet to be implemented */ -#define nvme_error_detected NULL -#define nvme_dump_registers NULL -#define nvme_link_reset NULL -#define nvme_slot_reset NULL -#define nvme_error_resume NULL - #ifdef CONFIG_PM_SLEEP static int nvme_suspend(struct device *dev) { @@ -2254,10 +2252,46 @@ static int nvme_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); +static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * A frozen channel requires a reset. When detected, this method will + * shutdown the controller to quiesce. The controller will be restarted + * after the slot reset through driver's slot_reset callback. + */ + dev_warn(&pdev->dev, "error detected: state:%d\n", state); + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + nvme_dev_shutdown(dev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + dev_info(&pdev->dev, "restart after slot reset\n"); + pci_restore_state(pdev); + queue_work(nvme_workq, &dev->reset_work); + return PCI_ERS_RESULT_RECOVERED; +} + +static void nvme_error_resume(struct pci_dev *pdev) +{ + pci_cleanup_aer_uncorrect_error_status(pdev); +} + static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, - .mmio_enabled = nvme_dump_registers, - .link_reset = nvme_link_reset, .slot_reset = nvme_slot_reset, .resume = nvme_error_resume, .reset_notify = nvme_reset_notify, From 2b9b6e86bca7209de02754fc84acf7ab3e78734e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 22 Dec 2015 10:10:45 -0700 Subject: [PATCH 55/67] NVMe: Export namespace attributes to sysfs Exposes the NGUID, EUI-64, and NSID to sysfs entries under the disk's kobject. 
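A quick userspace sanity check once this lands; a minimal sketch, assuming the namespace shows up as nvme0n1 (the device name is an assumption for the example; the new attributes sit directly under the disk's sysfs directory):

#include <stdio.h>

int main(void)
{
	char buf[64];
	/* the patch attaches uuid, eui and nsid attributes to the disk's
	 * kobject, i.e. under /sys/block/<disk>/ */
	FILE *f = fopen("/sys/block/nvme0n1/uuid", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("nguid: %s", buf);
	fclose(f);
	return 0;
}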
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 69 ++++++++++++++++++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 3 ++ 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b52a789e1e77..1437ff36e91c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -574,6 +574,11 @@ static int nvme_revalidate_disk(struct gendisk *disk) ns->type = NVME_NS_LIGHTNVM; } + if (ns->ctrl->vs >= NVME_VS(1, 1)) + memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + if (ns->ctrl->vs >= NVME_VS(1, 2)) + memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + old_ms = ns->ms; lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; ns->lba_shift = id->lbaf[lbaf].ds; @@ -964,6 +969,59 @@ static ssize_t nvme_sysfs_reset(struct device *dev, } static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%pU\n", ns->uuid); +} +static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%8phd\n", ns->eui); +} +static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%d\n", ns->ns_id); +} +static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); + +static struct attribute *nvme_ns_attrs[] = { + &dev_attr_uuid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, + NULL, +}; + +static umode_t nvme_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + + if (a == &dev_attr_uuid.attr) { + if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return 0; + } + return a->mode; +} + +static const struct attribute_group nvme_ns_attr_group = { + .attrs = nvme_ns_attrs, + .is_visible = nvme_attrs_are_visible, +}; + static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) { struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); @@ -1038,9 +1096,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) list_add_tail(&ns->list, &ctrl->namespaces); kref_get(&ctrl->kref); - if (ns->type != NVME_NS_LIGHTNVM) - add_disk(ns->disk); + if (ns->type == NVME_NS_LIGHTNVM) + return; + add_disk(ns->disk); + if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + ns->disk->disk_name); return; out_free_disk: kfree(disk); @@ -1060,6 +1123,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group); del_gendisk(ns->disk); } if (kill || !blk_queue_dying(ns->queue)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b0417622d27c..d88cf45fbcc1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -102,6 +102,9 @@ struct nvme_ns { struct gendisk *disk; struct kref kref; + u8 
eui[8]; + u8 uuid[16]; + unsigned ns_id; int lba_shift; u16 ms; From c89e5b80245899fc51fb1d83880e2f5762fcf350 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Wed, 23 Dec 2015 21:05:26 +0530 Subject: [PATCH 56/67] PCI/AER: include header file We are having a build failure with sparc allmodconfig with the error: drivers/nvme/host/pci.c:15:0: include/linux/aer.h: In function 'pci_enable_pcie_error_reporting': include/linux/aer.h:49:10: error: 'EINVAL' undeclared (first use in this function) The file aer.h is using the error values but they are defined in errno.h. Include errno.h so that we have the definitions of the error codes. Fixes: a0a3408ee614 ("NVMe: Add pci error handlers") Cc: Keith Busch Signed-off-by: Sudip Mukherjee Signed-off-by: Jens Axboe --- include/linux/aer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/aer.h b/include/linux/aer.h index 744b997d6a94..164049357e5c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -7,6 +7,7 @@ #ifndef _AER_H_ #define _AER_H_ +#include #include #define AER_NONFATAL 0 From 363c9aacb6c59bb63148dd115632880a4aed4d88 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 24 Dec 2015 15:26:59 +0100 Subject: [PATCH 57/67] nvme: Move nvme_freeze/unfreeze_queues to nvme core Nothing PCI specific about them, and we'll need them exported in other transports too. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 3 +++ drivers/nvme/host/pci.c | 32 ++------------------------------ 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1437ff36e91c..27130056136b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1328,6 +1328,34 @@ out: return ret; } +void nvme_freeze_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &ctrl->namespaces, list) { + blk_mq_freeze_queue_start(ns->queue); + + spin_lock_irq(ns->queue->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); + spin_unlock_irq(ns->queue->queue_lock); + + blk_mq_cancel_requeue_work(ns->queue); + blk_mq_stop_hw_queues(ns->queue); + } +} + +void nvme_unfreeze_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &ctrl->namespaces, list) { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); + blk_mq_unfreeze_queue(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_kick_requeue_list(ns->queue); + } +} + int __init nvme_core_init(void) { int result; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d88cf45fbcc1..0da67474ce6d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -237,6 +237,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_scan_namespaces(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); +void nvme_freeze_queues(struct nvme_ctrl *ctrl); +void nvme_unfreeze_queues(struct nvme_ctrl *ctrl); + struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); void nvme_requeue_req(struct request *req); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b82bbea909cd..a7e549969462 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1903,34 +1903,6 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) kthread_stop(tmp); } -static void nvme_freeze_queues(struct nvme_dev *dev) -{
- struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->ctrl.namespaces, list) { - blk_mq_freeze_queue_start(ns->queue); - - spin_lock_irq(ns->queue->queue_lock); - queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); - spin_unlock_irq(ns->queue->queue_lock); - - blk_mq_cancel_requeue_work(ns->queue); - blk_mq_stop_hw_queues(ns->queue); - } -} - -static void nvme_unfreeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->ctrl.namespaces, list) { - queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); - blk_mq_unfreeze_queue(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); - blk_mq_kick_requeue_list(ns->queue); - } -} - static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; @@ -1940,7 +1912,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) mutex_lock(&dev->shutdown_lock); if (dev->bar) { - nvme_freeze_queues(dev); + nvme_freeze_queues(&dev->ctrl); csts = readl(dev->bar + NVME_REG_CSTS); } if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { @@ -2049,7 +2021,7 @@ static void nvme_reset_work(struct work_struct *work) dev_warn(dev->dev, "IO queues not created\n"); nvme_remove_namespaces(&dev->ctrl); } else { - nvme_unfreeze_queues(dev); + nvme_unfreeze_queues(&dev->ctrl); nvme_dev_add(dev); } From 69d3b8ac15a5eb938e6a01909f6cc8ae4b5d3a17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Dec 2015 15:27:00 +0100 Subject: [PATCH 58/67] nvme: synchronize access to ctrl->namespaces Currently traversal and modification of ctrl->namespaces happens completely unsynchronized, which can be fixed by the addition of a simple mutex. Note: nvme_dev_ioctl will be handled in the next patch. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 17 +++++++++++++++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 18 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 27130056136b..a928ad5aabaa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1034,6 +1034,8 @@ static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; + lockdep_assert_held(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { if (ns->ns_id == nsid) return ns; @@ -1049,6 +1051,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; int node = dev_to_node(ctrl->dev); + lockdep_assert_held(&ctrl->namespaces_mutex); + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return; @@ -1118,6 +1122,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) bool kill = nvme_io_incapable(ns->ctrl) && !blk_queue_dying(ns->queue); + lockdep_assert_held(&ns->ctrl->namespaces_mutex); + if (kill) blk_set_queue_dying(ns->queue); if (ns->disk->flags & GENHD_FL_UP) { @@ -1188,6 +1194,8 @@ static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) struct nvme_ns *ns, *next; unsigned i; + lockdep_assert_held(&ctrl->namespaces_mutex); + for (i = 1; i <= nn; i++) nvme_validate_ns(ctrl, i); @@ -1205,6 +1213,7 @@ void nvme_scan_namespaces(struct nvme_ctrl *ctrl) if (nvme_identify_ctrl(ctrl, &id)) return; + mutex_lock(&ctrl->namespaces_mutex); nn = le32_to_cpu(id->nn); if (ctrl->vs >= NVME_VS(1, 1) && !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { @@ -1214,6 +1223,7 @@ void nvme_scan_namespaces(struct nvme_ctrl *ctrl) __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn)); done: list_sort(NULL, &ctrl->namespaces, ns_cmp); + 
mutex_unlock(&ctrl->namespaces_mutex); kfree(id); } @@ -1221,8 +1231,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; + mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); + mutex_unlock(&ctrl->namespaces_mutex); } static DEFINE_IDA(nvme_instance_ida); @@ -1290,6 +1302,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, int ret; INIT_LIST_HEAD(&ctrl->namespaces); + mutex_init(&ctrl->namespaces_mutex); kref_init(&ctrl->kref); ctrl->dev = dev; ctrl->ops = ops; @@ -1332,6 +1345,7 @@ void nvme_freeze_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { blk_mq_freeze_queue_start(ns->queue); @@ -1342,18 +1356,21 @@ void nvme_freeze_queues(struct nvme_ctrl *ctrl) blk_mq_cancel_requeue_work(ns->queue); blk_mq_stop_hw_queues(ns->queue); } + mutex_unlock(&ctrl->namespaces_mutex); } void nvme_unfreeze_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); blk_mq_unfreeze_queue(ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); blk_mq_kick_requeue_list(ns->queue); } + mutex_unlock(&ctrl->namespaces_mutex); } int __init nvme_core_init(void) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0da67474ce6d..44375923bd6a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -69,6 +69,7 @@ struct nvme_ctrl { int instance; struct blk_mq_tag_set *tagset; struct list_head namespaces; + struct mutex namespaces_mutex; struct device *device; /* char device */ struct list_head node; From bfd8947194b2e2a53db82bbc7eb7c15d028c46db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Dec 2015 15:27:01 +0100 Subject: [PATCH 59/67] nvme: fixes for NVME_IOCTL_IO_CMD on the char device Make sure we synchronize access to the namespaces list and grab a reference to the namespace before doing I/O. Make sure to reject the ioctl if multiple namespaces are present as it's entirely unsafe, and warn when using it even with a single namespace. 
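The core of the fix is the pin-then-unlock pattern in the hunk below. In sketch form, with the empty-list and multiple-namespace error paths elided:

	/* Take the reference under namespaces_mutex, then drop the lock:
	 * the kref keeps the namespace alive across the (potentially slow)
	 * command even if it is removed from the list in the meantime. */
	mutex_lock(&ctrl->namespaces_mutex);
	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
	kref_get(&ns->kref);
	mutex_unlock(&ctrl->namespaces_mutex);

	ret = nvme_user_cmd(ctrl, ns, argp);	/* may block; lock not held */
	nvme_put_ns(ns);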
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Acked-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a928ad5aabaa..51f6fc83b051 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -922,21 +922,50 @@ static int nvme_dev_release(struct inode *inode, struct file *file) return 0; } +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + mutex_lock(&ctrl->namespaces_mutex); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->dev, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->dev, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + mutex_unlock(&ctrl->namespaces_mutex); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + mutex_unlock(&ctrl->namespaces_mutex); + return ret; +} + static long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct nvme_ctrl *ctrl = file->private_data; void __user *argp = (void __user *)arg; - struct nvme_ns *ns; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_IO_CMD: - if (list_empty(&ctrl->namespaces)) - return -ENOTTY; - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); - return nvme_user_cmd(ctrl, ns, argp); + return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: dev_warn(ctrl->dev, "resetting controller\n"); return ctrl->ops->reset_ctrl(ctrl); From 4490733250b8b272a6d3e66352dd7b8025409549 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Dec 2015 15:27:02 +0100 Subject: [PATCH 60/67] nvme: make SG_IO support optional Translating SCSI commands to NVMe commands is rather pointless in general as applications must not expect to be able to use SCSI commands on a generic block device. Make the huge translation layer optional and hope no one will ever enable it in the future. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/Kconfig | 11 +++++++++++ drivers/nvme/host/Makefile | 3 ++- drivers/nvme/host/core.c | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 002a94abdbc4..5d6237391dcd 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -8,3 +8,14 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. + +config BLK_DEV_NVME_SCSI + bool "SCSI emulation for NVMe device nodes" + depends on BLK_DEV_NVME + ---help--- + This adds support for the SG_IO ioctl on the NVMe character + and block device nodes, as well as a translation for a small + number of selected SCSI commands to NVMe commands to the NVMe + driver. If you don't know what this means you probably want + to say N here, and if you know what it means you probably + want to say N as well.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 3e26dc921c38..baf9f52bbfa5 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -nvme-y += core.o pci.o scsi.o lightnvm.o +nvme-y += core.o pci.o lightnvm.o +nvme-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 51f6fc83b051..8da4a8a55c49 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -467,10 +467,12 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); +#ifdef CONFIG_BLK_DEV_NVME_SCSI case SG_GET_VERSION_NUM: return nvme_sg_get_version_num((void __user *)arg); case SG_IO: return nvme_sg_io(ns, (void __user *)arg); +#endif default: return -ENOTTY; } From e3e9d50cd6ed392bb716e35c134d1e82707c51b4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Jan 2016 09:10:55 -0700 Subject: [PATCH 61/67] NVMe: Fix admin queue ring wrap The tag set queue depth needs to be one less than the h/w queue depth so we don't wrap the circular buffer. This conforms to the specification defined "Full Queue" condition. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a7e549969462..30ed2ab2cadb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1271,7 +1271,12 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (!dev->ctrl.admin_q) { dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH; + + /* + * Subtract one to leave an empty queue entry for 'Full Queue' + * condition. See NVM-Express 1.2 specification, section 4.1.2. + */ + dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); From 1d49c38c4865c596b01b31a52540275c1bb383e7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Jan 2016 09:10:56 -0700 Subject: [PATCH 62/67] NVMe: Use a retryable error code on reset A negative status has the "do not retry" bit set, which makes it not retryable. Use a fake status that can potentially be retried on reset. An aborted command's status is overridden by the timeout handler so that it won't be retried, which is necessary to keep initialization from getting into a reset loop. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 30ed2ab2cadb..ac6c7afb2a6e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1017,7 +1017,7 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); - status = NVME_SC_CANCELLED; + status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) status |= NVME_SC_DNR; blk_mq_complete_request(req, status); From 25646264e15af96c5c630fc742708b1eb3339222 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Jan 2016 09:10:57 -0700 Subject: [PATCH 63/67] NVMe: Remove queue freezing on resets NVMe submits all commands through the block layer now. 
This means we can let requests queue at the blk-mq hardware context, since no path bypasses the block layer anymore, so we no longer need to freeze the queues. The driver can simply stop the h/w queues from running during a reset instead. This also fixes a WARN in percpu_ref_reinit when the queue was unfrozen with requeued requests. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 7 ++----- drivers/nvme/host/nvme.h | 4 ++-- drivers/nvme/host/pci.c | 8 ++++---- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8da4a8a55c49..e31a256127f7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1372,14 +1372,12 @@ out: return ret; } -void nvme_freeze_queues(struct nvme_ctrl *ctrl) +void nvme_stop_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - blk_mq_freeze_queue_start(ns->queue); - spin_lock_irq(ns->queue->queue_lock); queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); spin_unlock_irq(ns->queue->queue_lock); @@ -1390,14 +1388,13 @@ void nvme_freeze_queues(struct nvme_ctrl *ctrl) mutex_unlock(&ctrl->namespaces_mutex); } -void nvme_unfreeze_queues(struct nvme_ctrl *ctrl) +void nvme_start_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); - blk_mq_unfreeze_queue(ns->queue); blk_mq_start_stopped_hw_queues(ns->queue, true); blk_mq_kick_requeue_list(ns->queue); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 44375923bd6a..4722fadde0f1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -238,8 +238,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl); void nvme_scan_namespaces(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); -void nvme_freeze_queues(struct nvme_ctrl *ctrl); -void nvme_unfreeze_queues(struct nvme_ctrl *ctrl); +void nvme_stop_queues(struct nvme_ctrl *ctrl); +void nvme_start_queues(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ac6c7afb2a6e..953fe485a258 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1064,7 +1064,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) spin_unlock_irq(&nvmeq->q_lock); if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_freeze_queue_start(nvmeq->dev->ctrl.admin_q); + blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); irq_set_affinity_hint(vector, NULL); free_irq(vector, nvmeq); @@ -1296,7 +1296,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return -ENODEV; } } else - blk_mq_unfreeze_queue(dev->ctrl.admin_q); + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); return 0; } @@ -1917,7 +1917,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) mutex_lock(&dev->shutdown_lock); if (dev->bar) { - nvme_freeze_queues(&dev->ctrl); + nvme_stop_queues(&dev->ctrl); csts = readl(dev->bar + NVME_REG_CSTS); } if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { @@ -2026,7 +2026,7 @@ static void nvme_reset_work(struct work_struct *work) dev_warn(dev->dev, "IO queues not created\n"); nvme_remove_namespaces(&dev->ctrl); } else { - nvme_unfreeze_queues(&dev->ctrl); + nvme_start_queues(&dev->ctrl); nvme_dev_add(dev); } From
db3cbfff5bcc0b9a82d8c71f00b9d60fad215871 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jan 2016 14:41:17 -0700 Subject: [PATCH 64/67] NVMe: IO queue deletion re-write The nvme driver deletes IO queues asynchronously since this operation may potentially take an undesirable amount of time with a large number of queues if done serially. The driver used to manage coordinating asynchronous deletions. This patch simplifies that by leveraging the block layer rather than using kthread workers and chaining more complicated callbacks. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 251 +++++++++++++--------------------------- 1 file changed, 81 insertions(+), 170 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 953fe485a258..72f284ff42b6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -89,13 +89,6 @@ static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_remove_dead_ctrl(struct nvme_dev *dev); static void nvme_dev_shutdown(struct nvme_dev *dev); -struct async_cmd_info { - struct kthread_work work; - struct kthread_worker *worker; - int status; - void *ctx; -}; - /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ @@ -125,9 +118,11 @@ struct nvme_dev { u64 cmb_size; u32 cmbsz; unsigned long flags; + #define NVME_CTRL_RESETTING 0 struct nvme_ctrl ctrl; + struct completion ioq_wait; }; static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) @@ -159,7 +154,6 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - struct async_cmd_info cmdinfo; }; /* @@ -844,15 +838,6 @@ static void nvme_submit_async_event(struct nvme_dev *dev) __nvme_submit_cmd(dev->queues[0], &c); } -static void async_cmd_info_endio(struct request *req, int error) -{ - struct async_cmd_info *cmdinfo = req->end_io_data; - - cmdinfo->status = req->errors; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_mq_free_request(req); -} - static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { struct nvme_command c; @@ -1600,6 +1585,84 @@ static void nvme_dev_scan(struct work_struct *work) nvme_set_irq_hints(dev); } +static void nvme_del_queue_end(struct request *req, int error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + blk_mq_free_request(req); + complete(&nvmeq->dev->ioq_wait); +} + +static void nvme_del_cq_end(struct request *req, int error) +{ + struct nvme_queue *nvmeq = req->end_io_data; + + if (!error) { + unsigned long flags; + + spin_lock_irqsave(&nvmeq->q_lock, flags); + nvme_process_cq(nvmeq); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + } + + nvme_del_queue_end(req, error); +} + +static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) +{ + struct request_queue *q = nvmeq->dev->ctrl.admin_q; + struct request *req; + struct nvme_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); + + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = ADMIN_TIMEOUT; + req->end_io_data = nvmeq; + + blk_execute_rq_nowait(q, NULL, req, false, + opcode == nvme_admin_delete_cq ? 
+ nvme_del_cq_end : nvme_del_queue_end); + return 0; +} + +static void nvme_disable_io_queues(struct nvme_dev *dev) +{ + int pass; + unsigned long timeout; + u8 opcode = nvme_admin_delete_sq; + + for (pass = 0; pass < 2; pass++) { + int sent = 0, i = dev->queue_count - 1; + + reinit_completion(&dev->ioq_wait); + retry: + timeout = ADMIN_TIMEOUT; + for (; i > 0; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + + if (!pass) + nvme_suspend_queue(nvmeq); + if (nvme_delete_queue(nvmeq, opcode)) + break; + ++sent; + } + while (sent--) { + timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); + if (timeout == 0) + return; + if (i) + goto retry; + } + opcode = nvme_admin_delete_cq; + } +} + /* * Return: error value if an error occurred setting up the queues or calling * Identify Device. 0 if these succeeded, even if adding some of the @@ -1711,159 +1774,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev) } } -struct nvme_delq_ctx { - struct task_struct *waiter; - struct kthread_worker *worker; - atomic_t refcount; -}; - -static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) -{ - dq->waiter = current; - mb(); - - for (;;) { - set_current_state(TASK_KILLABLE); - if (!atomic_read(&dq->refcount)) - break; - if (!schedule_timeout(ADMIN_TIMEOUT) || - fatal_signal_pending(current)) { - /* - * Disable the controller first since we can't trust it - * at this point, but leave the admin queue enabled - * until all queue deletion requests are flushed. - * FIXME: This may take a while if there are more h/w - * queues than admin tags. - */ - set_current_state(TASK_RUNNING); - nvme_disable_ctrl(&dev->ctrl, - lo_hi_readq(dev->bar + NVME_REG_CAP)); - nvme_clear_queue(dev->queues[0]); - flush_kthread_worker(dq->worker); - nvme_disable_queue(dev, 0); - return; - } - } - set_current_state(TASK_RUNNING); -} - -static void nvme_put_dq(struct nvme_delq_ctx *dq) -{ - atomic_dec(&dq->refcount); - if (dq->waiter) - wake_up_process(dq->waiter); -} - -static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) -{ - atomic_inc(&dq->refcount); - return dq; -} - -static void nvme_del_queue_end(struct nvme_queue *nvmeq) -{ - struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - nvme_put_dq(dq); - - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - -static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, - kthread_work_func_t fn) -{ - struct request *req; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(nvmeq->qid); - - init_kthread_work(&nvmeq->cmdinfo.work, fn); - - req = nvme_alloc_request(nvmeq->dev->ctrl.admin_q, &c, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout = ADMIN_TIMEOUT; - req->end_io_data = &nvmeq->cmdinfo; - blk_execute_rq_nowait(req->q, NULL, req, 0, async_cmd_info_endio); - return 0; -} - -static void nvme_del_cq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_cq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, - nvme_del_cq_work_handler); -} - -static void nvme_del_sq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - int status = nvmeq->cmdinfo.status; - - if (!status) - status = nvme_delete_cq(nvmeq); - if (status) - nvme_del_queue_end(nvmeq); -} - -static int 
nvme_delete_sq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, - nvme_del_sq_work_handler); -} - -static void nvme_del_queue_start(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - if (nvme_delete_sq(nvmeq)) - nvme_del_queue_end(nvmeq); -} - -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - int i; - DEFINE_KTHREAD_WORKER_ONSTACK(worker); - struct nvme_delq_ctx dq; - struct task_struct *kworker_task = kthread_run(kthread_worker_fn, - &worker, "nvme%d", dev->ctrl.instance); - - if (IS_ERR(kworker_task)) { - dev_err(dev->dev, - "Failed to create queue del task\n"); - for (i = dev->queue_count - 1; i > 0; i--) - nvme_disable_queue(dev, i); - return; - } - - dq.waiter = NULL; - atomic_set(&dq.refcount, 0); - dq.worker = &worker; - for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - - if (nvme_suspend_queue(nvmeq)) - continue; - nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); - nvmeq->cmdinfo.worker = dq.worker; - init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); - queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); - } - nvme_wait_dq(&dq, dev); - kthread_stop(kworker_task); -} - static int nvme_dev_list_add(struct nvme_dev *dev) { bool start_thread = false; @@ -2146,6 +2056,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); + init_completion(&dev->ioq_wait); result = nvme_setup_prp_pools(dev); if (result) From a5cdb68c2c10f0865122656833cd07636a4143ee Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jan 2016 14:41:18 -0700 Subject: [PATCH 65/67] NVMe: Shutdown controller only for power-off We don't need to shutdown a controller for a reset. A controller in a shutdown state may take longer to become ready than one that was simply disabled. This patch has the driver shut down a controller only if the device is about to be powered off or being removed. When taking the controller down for a reset reason, the controller will be disabled instead. Function names have been updated in this patch to reflect their changed semantics. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 72f284ff42b6..8ff6ac5cafbe 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -87,7 +87,7 @@ struct nvme_queue; static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_remove_dead_ctrl(struct nvme_dev *dev); -static void nvme_dev_shutdown(struct nvme_dev *dev); +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
@@ -932,7 +932,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->dev, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); req->errors = NVME_SC_CANCELLED; return BLK_EH_HANDLED; } @@ -946,7 +946,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); queue_work(nvme_workq, &dev->reset_work); /* @@ -1065,21 +1065,20 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) spin_unlock_irq(&nvmeq->q_lock); } -static void nvme_disable_queue(struct nvme_dev *dev, int qid) +static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[qid]; + struct nvme_queue *nvmeq = dev->queues[0]; if (!nvmeq) return; if (nvme_suspend_queue(nvmeq)) return; - /* Don't tell the adapter to delete the admin queue. - * Don't tell a removed adapter to delete IO queues. */ - if (qid && readl(dev->bar + NVME_REG_CSTS) != -1) { - adapter_delete_sq(dev, qid); - adapter_delete_cq(dev, qid); - } + if (shutdown) + nvme_shutdown_ctrl(&dev->ctrl); + else + nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( + dev->bar + NVME_REG_CAP)); spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); @@ -1818,7 +1817,7 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) kthread_stop(tmp); } -static void nvme_dev_shutdown(struct nvme_dev *dev) +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i; u32 csts = -1; @@ -1837,8 +1836,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) } } else { nvme_disable_io_queues(dev); - nvme_shutdown_ctrl(&dev->ctrl); - nvme_disable_queue(dev, 0); + nvme_disable_admin_queue(dev, shutdown); } nvme_dev_unmap(dev); @@ -1897,7 +1895,7 @@ static void nvme_reset_work(struct work_struct *work) * moving on. 
*/ if (dev->bar) - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); set_bit(NVME_CTRL_RESETTING, &dev->flags); @@ -1951,7 +1949,7 @@ static void nvme_reset_work(struct work_struct *work) dev->ctrl.admin_q = NULL; dev->queues[0]->tags = NULL; disable: - nvme_disable_queue(dev, 0); + nvme_disable_admin_queue(dev, false); unmap: nvme_dev_unmap(dev); out: @@ -2086,7 +2084,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) struct nvme_dev *dev = pci_get_drvdata(pdev); if (prepare) - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); else queue_work(nvme_workq, &dev->reset_work); } @@ -2094,7 +2092,7 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, true); } static void nvme_remove(struct pci_dev *pdev) @@ -2110,7 +2108,7 @@ static void nvme_remove(struct pci_dev *pdev) flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); nvme_uninit_ctrl(&dev->ctrl); - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, true); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_release_cmb(dev); @@ -2124,7 +2122,7 @@ static int nvme_suspend(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_dev_shutdown(ndev); + nvme_dev_disable(ndev, true); return 0; } @@ -2155,7 +2153,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; case pci_channel_io_frozen: - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: return PCI_ERS_RESULT_DISCONNECT; From 779ff75617099f4defe14e20443b95019a4c5ae8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jan 2016 15:09:31 -0700 Subject: [PATCH 66/67] NVMe: Export NVMe attributes to sysfs group Adds all controller information to the attribute list exposed to sysfs, and appends the reset_controller attribute to it. The nvme device is created with this attribute list, so the driver no longer manages its attributes.
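The per-field boilerplate in the hunk below is generated by the nvme_show_function() macro; as a reference, nvme_show_function(model) expands to roughly:

static ssize_t model_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	/* model is a fixed-size, space-padded array, hence the %.*s */
	return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->model), ctrl->model);
}
static DEVICE_ATTR(model, S_IRUGO, model_show, NULL);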
Reported-by: Sujith Pandel Cc: Sujith Pandel Cc: David Milburn Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 44 ++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e31a256127f7..3e9c5e1e3b6d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1053,6 +1053,36 @@ static const struct attribute_group nvme_ns_attr_group = { .is_visible = nvme_attrs_are_visible, }; +#define nvme_show_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_function(model); +nvme_show_function(serial); +nvme_show_function(firmware_rev); + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + NULL +}; + +static struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, +}; + +static const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) { struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); @@ -1299,7 +1329,6 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { - device_remove_file(ctrl->device, &dev_attr_reset_controller); device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); @@ -1343,9 +1372,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out; - ctrl->device = device_create(nvme_class, ctrl->dev, + ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, MKDEV(nvme_char_major, ctrl->instance), - dev, "nvme%d", ctrl->instance); + dev, nvme_dev_attr_groups, + "nvme%d", ctrl->instance); if (IS_ERR(ctrl->device)) { ret = PTR_ERR(ctrl->device); goto out_release_instance; @@ -1353,19 +1383,11 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, get_device(ctrl->device); dev_set_drvdata(ctrl->device, ctrl); - ret = device_create_file(ctrl->device, &dev_attr_reset_controller); - if (ret) - goto out_put_device; - spin_lock(&dev_list_lock); list_add_tail(&ctrl->node, &nvme_ctrl_list); spin_unlock(&dev_list_lock); return 0; - -out_put_device: - put_device(ctrl->device); - device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); out_release_instance: nvme_release_instance(ctrl); out: From a9cf8284b45110a4d98aea180a89c857e53bf850 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sun, 10 Jan 2016 20:14:11 -0500 Subject: [PATCH 67/67] uapi: update install list after nvme.h rename Commit 9d99a8dda154 ("nvme: move hardware structures out of the uapi version of nvme.h") renamed nvme.h to nvme_ioctl.h, but the uapi list still refers to nvme.h. People trying to install the headers hit a failure as the header no longer exists. 
Cc: stable@vger.kernel.org Signed-off-by: Mike Frysinger Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 628e6e64c2fb..88e1292cb786 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -306,7 +306,7 @@ header-y += nfs_mount.h header-y += nl80211.h header-y += n_r3964.h header-y += nubus.h -header-y += nvme.h +header-y += nvme_ioctl.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h