From e1e5e5641e6f271321aec257ed26a72715e4a8c2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:39:03 -0700 Subject: [PATCH 1/9] NVMe: Metadata format support Adds support for NVMe metadata formats and exposes block devices for all namespaces regardless of their format. Namespace formats that are unusable will have disk capacity set to 0, but a handle to the block device is created to simplify device management. A namespace is not usable when the format requires host interleave block and metadata in single buffer, has no provisioned storage, or has better data but failed to register with blk integrity. The namespace has to be scanned in two phases to support separate metadata formats. The first establishes the sector size and capacity prior to invoking add_disk. If metadata is required, the capacity will be temporarilly set to 0 until it can be revalidated and registered with the integrity extenstions after add_disk completes. The driver relies on the integrity extensions to provide the metadata buffer. NVMe requires this be a single physically contiguous region, so only one integrity segment is allowed per command. If the metadata is used for T10 PI, the driver provides mappings to save and restore the reftag physical block translation. The driver provides no-op functions for generate and verify if metadata is not used for protection information. This way the setup is always provided by the block layer. If a request does not supply a required metadata buffer, the command is failed with bad address. This could only happen if a user manually disables verify/generate on such a disk. The only exception to where this is okay is if the controller is capable of stripping/generating the metadata, which is possible on some types of formats. The metadata scatter gather list now occupies the spot in the nvme_iod that used to be used to link retryable IOD's, but we don't do that anymore, so the field was unused. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 298 +++++++++++++++++++++++++++++--------- include/linux/nvme.h | 2 + include/uapi/linux/nvme.h | 16 ++ 3 files changed, 249 insertions(+), 67 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cbdfbbf98392..3ffa57a932ea 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -482,6 +483,62 @@ static int nvme_error_status(u16 status) } } +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == v) + pi->ref_tag = cpu_to_be32(p); +} + +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == p) + pi->ref_tag = cpu_to_be32(v); +} + +/** + * nvme_dif_remap - remaps ref tags to bip seed and physical lba + * + * The virtual start sector is the one that was originally submitted by the + * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical + * start sector may be different. Remap protection information to match the + * physical LBA on writes, and back to the original seed on reads. + * + * Type 0 and 3 do not have a ref tag, so no remapping required. + */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ + struct nvme_ns *ns = req->rq_disk->private_data; + struct bio_integrity_payload *bip; + struct t10_pi_tuple *pi; + void *p, *pmap; + u32 i, nlb, ts, phys, virt; + + if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) + return; + + bip = bio_integrity(req->bio); + if (!bip) + return; + + pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; + if (!pmap) + return; + + p = pmap; + virt = bip_get_seed(bip); + phys = nvme_block_nr(ns, blk_rq_pos(req)); + nlb = (blk_rq_bytes(req) >> ns->lba_shift); + ts = ns->disk->integrity->tuple_size; + + for (i = 0; i < nlb; i++, virt++, phys++) { + pi = (struct t10_pi_tuple *)p; + dif_swap(phys, virt, pi); + p += ts; + } + kunmap_atomic(pmap); +} + static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { @@ -512,9 +569,16 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, "completing aborted command with status:%04x\n", status); - if (iod->nents) + if (iod->nents) { dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + } + } nvme_free_iod(nvmeq->dev, iod); blk_mq_complete_request(req); @@ -670,6 +734,24 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (blk_integrity_rq(req)) { + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + } else if (ns->ms) + control |= NVME_RW_PRINFO_PRACT; + cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -690,6 +772,19 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_iod *iod; enum dma_data_direction dma_dir; + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8)) { + req->errors = -EFAULT; + blk_mq_complete_request(req); + return BLK_MQ_RQ_QUEUE_OK; + } + } + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; @@ -725,6 +820,21 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, iod->nents, dma_dir); goto retry_cmd; } + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) + goto error_cmd; + + sg_init_table(iod->meta_sg, 1); + if (blk_rq_map_integrity_sg( + req->q, req->bio, iod->meta_sg) != 1) + goto error_cmd; + + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) + goto error_cmd; + } } nvme_set_info(cmd, iod, req_completion); @@ -1875,13 +1985,61 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_noop_verify(struct blk_integrity_iter *iter) +{ + return 0; +} + +static int nvme_noop_generate(struct blk_integrity_iter *iter) +{ + return 0; +} + +struct blk_integrity nvme_meta_noop = { + .name = "NVME_META_NOOP", + .generate_fn = nvme_noop_generate, + .verify_fn = nvme_noop_verify, +}; + +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity = t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity = t10_pi_type1_crc; + break; + default: + integrity = nvme_meta_noop; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} + static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id; dma_addr_t dma_addr; - int lbaf; + int lbaf, pi_type, old_ms; + unsigned short bs; id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, GFP_KERNEL); @@ -1890,16 +2048,50 @@ static int nvme_revalidate_disk(struct gendisk *disk) __func__); return 0; } + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) { + dev_warn(&dev->pci_dev->dev, + "identify failed ns:%d, setting capacity to 0\n", + ns->ns_id); + memset(id, 0, sizeof(*id)); + } - if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) - goto free; - - lbaf = id->flbas & 0xf; + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? + id->dps & NVME_NS_DPS_PI_MASK : 0; + + if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) && + !(id->flbas & NVME_NS_FLBAS_META_EXT)) + nvme_init_integrity(ns); + + if (id->ncap == 0 || (ns->ms && !disk->integrity)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - free: dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); return 0; } @@ -1956,30 +2148,16 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - ns->queue->limits.max_discard_sectors = 0xffffffff; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; int node = dev_to_node(&dev->pci_dev->dev); - int lbaf; - - if (rt->attributes & NVME_LBART_ATTRIB_HIDE) - return NULL; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return NULL; + return; + ns->queue = blk_mq_init_queue(&dev->tagset); if (IS_ERR(ns->queue)) goto out_free_ns; @@ -1995,9 +2173,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, ns->ns_id = nsid; ns->disk = disk; - lbaf = id->flbas & 0xf; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + list_add_tail(&ns->list, &dev->namespaces); + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); @@ -2014,18 +2192,23 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, disk->driverfs_dev = &dev->pci_dev->dev; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - - return ns; + /* + * Initialize capacity to 0 until we establish the namespace format and + * setup integrity extentions if necessary. The revalidate_disk after + * add_disk allows the driver to register with integrity if the format + * requires it. + */ + set_capacity(disk, 0); + nvme_revalidate_disk(ns->disk); + add_disk(ns->disk); + if (ns->ms) + revalidate_disk(ns->disk); + return; out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); - return NULL; } static void nvme_create_io_queues(struct nvme_dev *dev) @@ -2150,22 +2333,20 @@ static int nvme_dev_add(struct nvme_dev *dev) struct pci_dev *pdev = dev->pci_dev; int res; unsigned nn, i; - struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; - struct nvme_id_ns *id_ns; void *mem; dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL); + mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); if (!mem) return -ENOMEM; res = nvme_identify(dev, 0, 1, dma_addr); if (res) { dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); - res = -EIO; - goto out; + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + return -EIO; } ctrl = mem; @@ -2191,6 +2372,7 @@ static int nvme_dev_add(struct nvme_dev *dev) } else dev->max_hw_sectors = max_hw_sectors; } + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2203,33 +2385,12 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->tagset)) - goto out; + return 0; - id_ns = mem; - for (i = 1; i <= nn; i++) { - res = nvme_identify(dev, i, 0, dma_addr); - if (res) - continue; + for (i = 1; i <= nn; i++) + nvme_alloc_ns(dev, i); - if (id_ns->ncap == 0) - continue; - - res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); - if (res) - memset(mem + 4096, 0, 4096); - - ns = nvme_alloc_ns(dev, i, mem, mem + 4096); - if (ns) - list_add_tail(&ns->list, &dev->namespaces); - } - list_for_each_entry(ns, &dev->namespaces, list) - add_disk(ns->disk); - res = 0; - - out: - dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); - return res; + return 0; } static int nvme_dev_map(struct nvme_dev *dev) @@ -2528,8 +2689,11 @@ static void nvme_dev_remove(struct nvme_dev *dev) struct nvme_ns *ns; list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->disk->flags & GENHD_FL_UP) + if (ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->integrity) + blk_integrity_unregister(ns->disk); del_gendisk(ns->disk); + } if (!blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 19a5d4b23209..cca264db2478 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -121,6 +121,7 @@ struct nvme_ns { unsigned ns_id; int lba_shift; int ms; + int pi_type; u64 mode_select_num_blocks; u32 mode_select_block_len; }; @@ -138,6 +139,7 @@ struct nvme_iod { int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; + struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ struct scatterlist sg[0]; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 26386cf3db44..406bfc95652c 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -124,10 +124,22 @@ struct nvme_id_ns { enum { NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, }; struct nvme_smart_log { @@ -261,6 +273,10 @@ enum { NVME_RW_DSM_LATENCY_LOW = 3 << 4, NVME_RW_DSM_SEQ_REQ = 1 << 6, NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, }; struct nvme_dsm_cmd { From 4f1982b4e262c45475a91b4253e9bc7f7c991c13 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 13:42:14 -0700 Subject: [PATCH 2/9] NVMe: Update SCSI Inquiry VPD 83h translation The original translation created collisions on Inquiry VPD 83 for many existing devices. Newer specifications provide other ways to translate based on the device's version can be used to create unique identifiers. Version 1.1 provides an EUI64 field that uniquely identifies each namespace, and 1.2 added the longer NGUID field for the same reason. Both follow the IEEE EUI format and readily translate to the SCSI device identification EUI designator type 2h. For devices implementing either, the translation will use this type, defaulting to the EUI64 8-byte type if implemented then NGUID's 16 byte version if not. If neither are provided, the 1.0 translation is used, and is updated to use the SCSI String format to guarantee a unique identifier. Knowing when to use the new fields depends on the nvme controller's revision. The NVME_VS macro was not decoding this correctly, so that is fixed in this patch and moved to a more appropriate place. Since the Identify Namespace structure required an update for the NGUID field, this patch adds the remaining new 1.2 fields to the structure. Signed-off-by: Keith Busch --- drivers/block/nvme-scsi.c | 94 ++++++++++++++++++++++----------------- include/linux/nvme.h | 2 - include/uapi/linux/nvme.h | 10 ++++- 3 files changed, 62 insertions(+), 44 deletions(-) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 5e78568026c3..5cc907648742 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -779,10 +779,8 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_dev *dev = ns->dev; dma_addr_t dma_addr; void *mem; - struct nvme_id_ctrl *id_ctrl; int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - u8 ieee[4]; int xfer_len; __be32 tmp_id = cpu_to_be32(ns->ns_id); @@ -793,46 +791,60 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } - /* nvme controller identify */ - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ctrl = mem; - - /* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */ - ieee[0] = id_ctrl->ieee[0] << 4; - ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4; - ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4; - ieee[3] = id_ctrl->ieee[2] >> 4; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + memset(inq_response, 0, alloc_len); inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - inq_response[3] = 20; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x03; /* PIV=0b | Asso=00b | Designator Type=3h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 16; /* Designator Length */ - /* Designator start */ - inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/ - inq_response[9] = ieee[2]; /* IEEE ID */ - inq_response[10] = ieee[1]; /* IEEE ID */ - inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... */ - inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8; - inq_response[13] = (dev->pci_dev->vendor & 0x00FF); - inq_response[14] = dev->serial[0]; - inq_response[15] = dev->serial[1]; - inq_response[16] = dev->model[0]; - inq_response[17] = dev->model[1]; - memcpy(&inq_response[18], &tmp_id, sizeof(u32)); - /* Last 2 bytes are zero */ + if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { + struct nvme_id_ns *id_ns = mem; + void *eui = id_ns->eui64; + int len = sizeof(id_ns->eui64); - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + + if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { + if (bitmap_empty(eui, len * 8)) { + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); + } + } + if (bitmap_empty(eui, len * 8)) + goto scsi_string; + + inq_response[3] = 4 + len; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + } else { + scsi_string: + if (alloc_len < 72) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_free; + } + inq_response[3] = 0x48; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor); + memcpy(&inq_response[12], dev->model, sizeof(dev->model)); + sprintf(&inq_response[52], "%04x", tmp_id); + memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + } + xfer_len = alloc_len; res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); out_free: @@ -2222,7 +2234,7 @@ static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, page_code = GET_INQ_PAGE_CODE(cmd); alloc_len = GET_INQ_ALLOC_LENGTH(cmd); - inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL); + inq_response = kmalloc(alloc_len, GFP_KERNEL); if (inq_response == NULL) { res = -ENOMEM; goto out_mem; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index cca264db2478..1f062a9e521d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -62,8 +62,6 @@ enum { NVME_CSTS_SHST_MASK = 3 << 2, }; -#define NVME_VS(major, minor) (major << 16 | minor) - extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 406bfc95652c..aef9a81b2d75 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -115,7 +115,13 @@ struct nvme_id_ns { __le16 nawun; __le16 nawupf; __le16 nacwu; - __u8 rsvd40[80]; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __u16 rsvd46; + __le64 nvmcap[2]; + __u8 rsvd64[40]; + __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; __u8 rsvd192[192]; @@ -565,6 +571,8 @@ struct nvme_passthru_cmd { __u32 result; }; +#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) From b3fffdefabab266ae5176a136d93b6670b07bb30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 3 Feb 2015 11:21:42 -0700 Subject: [PATCH 3/9] NVMe: Register management handle under nvme class This creates a new class type for nvme devices to register their management character devices with. This is so we do not rely on miscdev to provide enough minors for as many nvme devices some people plan to use. The previous limit was approximately 60 NVMe controllers, depending on the platform and kernel. Now the limit is 1M, which ought to be enough for anybody. Since we have a new device class, it makes sense to attach the block devices under this as well, so part of this patch moves the management handle initialization prior to the namespaces discovery. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 79 +++++++++++++++++++++++++++------------ include/linux/nvme.h | 3 +- 2 files changed, 57 insertions(+), 25 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 3ffa57a932ea..bb2b861cfed9 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -42,6 +42,7 @@ #include #include +#define NVME_MINORS (1U << MINORBITS) #define NVME_Q_DEPTH 1024 #define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) @@ -69,6 +70,9 @@ MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown") static int nvme_major; module_param(nvme_major, int, 0); +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -79,6 +83,8 @@ static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; static struct notifier_block nvme_nb; +static struct class *nvme_class; + static void nvme_reset_failed_dev(struct work_struct *ws); static int nvme_process_cq(struct nvme_queue *nvmeq); @@ -2189,7 +2195,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; - disk->driverfs_dev = &dev->pci_dev->dev; + disk->driverfs_dev = dev->device; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); @@ -2775,6 +2781,7 @@ static void nvme_free_dev(struct kref *kref) struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); pci_dev_put(dev->pci_dev); + put_device(dev->device); nvme_free_namespaces(dev); nvme_release_instance(dev); blk_mq_free_tag_set(&dev->tagset); @@ -2786,11 +2793,23 @@ static void nvme_free_dev(struct kref *kref) static int nvme_dev_open(struct inode *inode, struct file *f) { - struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev, - miscdev); - kref_get(&dev->kref); - f->private_data = dev; - return 0; + struct nvme_dev *dev; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + if (dev->instance == instance) { + if (!kref_get_unless_zero(&dev->kref)) + break; + f->private_data = dev; + ret = 0; + break; + } + } + spin_unlock(&dev_list_lock); + + return ret; } static int nvme_dev_release(struct inode *inode, struct file *f) @@ -3002,29 +3021,26 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; + dev->device = device_create(nvme_class, &pdev->dev, + MKDEV(nvme_char_major, dev->instance), + dev, "nvme%d", dev->instance); + if (IS_ERR(dev->device)) { + result = PTR_ERR(dev->device); + goto shutdown; + } + get_device(dev->device); + if (dev->online_queues > 1) result = nvme_dev_add(dev); if (result) - goto shutdown; - - scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); - dev->miscdev.minor = MISC_DYNAMIC_MINOR; - dev->miscdev.parent = &pdev->dev; - dev->miscdev.name = dev->name; - dev->miscdev.fops = &nvme_dev_fops; - result = misc_register(&dev->miscdev); - if (result) - goto remove; + goto device_del; nvme_set_irq_hints(dev); - dev->initialized = 1; return 0; - remove: - nvme_dev_remove(dev); - nvme_dev_remove_admin(dev); - nvme_free_namespaces(dev); + device_del: + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); shutdown: nvme_dev_shutdown(dev); release_pools: @@ -3067,10 +3083,10 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); - misc_deregister(&dev->miscdev); nvme_dev_shutdown(dev); nvme_dev_remove(dev); nvme_dev_remove_admin(dev); + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); @@ -3154,11 +3170,26 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (!nvme_class) + goto unregister_chrdev; + result = pci_register_driver(&nvme_driver); if (result) - goto unregister_blkdev; + goto destroy_class; return 0; + destroy_class: + class_destroy(nvme_class); + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -3172,6 +3203,8 @@ static void __exit nvme_exit(void) unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1f062a9e521d..383d495c5e4c 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -89,7 +88,7 @@ struct nvme_dev { struct nvme_bar __iomem *bar; struct list_head namespaces; struct kref kref; - struct miscdevice miscdev; + struct device *device; work_func_t reset_workfn; struct work_struct reset_work; char name[12]; From 2e1d8448196ba85cd78a18723413a3c92aabe0f3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 12 Feb 2015 15:33:00 -0700 Subject: [PATCH 4/9] NVMe: Asynchronous controller probe This performs the longest parts of nvme device probe in scheduled work. This speeds up probe significantly when multiple devices are in use. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 48 +++++++++++++++++++++++++-------------- include/linux/nvme.h | 1 + 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index bb2b861cfed9..a57685f74e5e 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2800,6 +2800,10 @@ static int nvme_dev_open(struct inode *inode, struct file *f) spin_lock(&dev_list_lock); list_for_each_entry(dev, &dev_list, node) { if (dev->instance == instance) { + if (!dev->admin_q) { + ret = -EWOULDBLOCK; + break; + } if (!kref_get_unless_zero(&dev->kref)) break; f->private_data = dev; @@ -2982,6 +2986,7 @@ static void nvme_reset_workfn(struct work_struct *work) dev->reset_workfn(work); } +static void nvme_async_probe(struct work_struct *work); static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -3017,34 +3022,20 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release; kref_init(&dev->kref); - result = nvme_dev_start(dev); - if (result) - goto release_pools; - dev->device = device_create(nvme_class, &pdev->dev, MKDEV(nvme_char_major, dev->instance), dev, "nvme%d", dev->instance); if (IS_ERR(dev->device)) { result = PTR_ERR(dev->device); - goto shutdown; + goto release_pools; } get_device(dev->device); - if (dev->online_queues > 1) - result = nvme_dev_add(dev); - if (result) - goto device_del; - - nvme_set_irq_hints(dev); - dev->initialized = 1; + INIT_WORK(&dev->probe_work, nvme_async_probe); + schedule_work(&dev->probe_work); return 0; - device_del: - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); - shutdown: - nvme_dev_shutdown(dev); release_pools: - nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); release: nvme_release_instance(dev); @@ -3057,6 +3048,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) return result; } +static void nvme_async_probe(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); + int result; + + result = nvme_dev_start(dev); + if (result) + goto reset; + + if (dev->online_queues > 1) + result = nvme_dev_add(dev); + if (result) + goto reset; + + nvme_set_irq_hints(dev); + dev->initialized = 1; + return; + reset: + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); +} + static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { struct nvme_dev *dev = pci_get_drvdata(pdev); @@ -3082,6 +3095,7 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); + flush_work(&dev->probe_work); flush_work(&dev->reset_work); nvme_dev_shutdown(dev); nvme_dev_remove(dev); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 383d495c5e4c..e2429e8cdab4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -91,6 +91,7 @@ struct nvme_dev { struct device *device; work_func_t reset_workfn; struct work_struct reset_work; + struct work_struct probe_work; char name[12]; char serial[20]; char model[40]; From 07836e659c81ec6b0d683dfbf7958339a22a7b69 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 10:34:48 -0700 Subject: [PATCH 5/9] NVMe: Fix potential corruption during shutdown The driver has to end unreturned commands at some point even if the controller has not provided a completion. The driver tried to be safe by deleting IO queues prior to ending all unreturned commands. That should cause the controller to internally abort inflight commands, but IO queue deletion request does not have to be successful, so all bets are off. We still have to make progress, so to be extra safe, this patch doesn't clear a queue to release the dma mapping for a command until after the pci device has been disabled. This patch removes the special handling during device initialization so controller recovery can be done all the time. This is possible since initialization is not inlined with pci probe anymore. Reported-by: Nilish Choudhury Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 49 +++++++++++++++------------------------ include/linux/nvme.h | 1 - 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index a57685f74e5e..0f388206b15b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1274,29 +1274,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = cmd->nvmeq; + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, + nvmeq->qid); + spin_lock_irq(&nvmeq->q_lock); + nvme_abort_req(req); + spin_unlock_irq(&nvmeq->q_lock); + /* * The aborted req will be completed on receiving the abort req. * We enable the timer again. If hit twice, it'll cause a device reset, * as the device then is in a faulty state. */ - int ret = BLK_EH_RESET_TIMER; - - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, - nvmeq->qid); - - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->dev->initialized) { - /* - * Force cancelled command frees the request, which requires we - * return BLK_EH_NOT_HANDLED. - */ - nvme_cancel_queue_ios(nvmeq->hctx, req, nvmeq, reserved); - ret = BLK_EH_NOT_HANDLED; - } else - nvme_abort_req(req); - spin_unlock_irq(&nvmeq->q_lock); - - return ret; + return BLK_EH_RESET_TIMER; } static void nvme_free_queue(struct nvme_queue *nvmeq) @@ -1349,7 +1338,6 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) struct blk_mq_hw_ctx *hctx = nvmeq->hctx; spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); if (hctx && hctx->tags) blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); @@ -1372,7 +1360,10 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) } if (!qid && dev->admin_q) blk_mq_freeze_queue_start(dev->admin_q); - nvme_clear_queue(nvmeq); + + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, @@ -2121,8 +2112,7 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - if (readl(&dev->bar->csts) & NVME_CSTS_CFS && - dev->initialized) { + if (readl(&dev->bar->csts) & NVME_CSTS_CFS) { if (work_busy(&dev->reset_work)) continue; list_del_init(&dev->node); @@ -2525,8 +2515,6 @@ static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) static void nvme_del_queue_end(struct nvme_queue *nvmeq) { struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - - nvme_clear_queue(nvmeq); nvme_put_dq(dq); } @@ -2669,7 +2657,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; u32 csts = -1; - dev->initialized = 0; nvme_dev_list_remove(dev); if (dev->bar) { @@ -2680,7 +2667,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) for (i = dev->queue_count - 1; i >= 0; i--) { struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); - nvme_clear_queue(nvmeq); } } else { nvme_disable_io_queues(dev); @@ -2688,6 +2674,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_disable_queue(dev, 0); } nvme_dev_unmap(dev); + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_clear_queue(dev->queues[i]); } static void nvme_dev_remove(struct nvme_dev *dev) @@ -2955,7 +2944,6 @@ static int nvme_dev_resume(struct nvme_dev *dev) nvme_unfreeze_queues(dev); nvme_set_irq_hints(dev); } - dev->initialized = 1; return 0; } @@ -3063,11 +3051,12 @@ static void nvme_async_probe(struct work_struct *work) goto reset; nvme_set_irq_hints(dev); - dev->initialized = 1; return; reset: - dev->reset_workfn = nvme_reset_failed_dev; - queue_work(nvme_workq, &dev->reset_work); + if (!work_busy(&dev->reset_work)) { + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + } } static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e2429e8cdab4..0adad4a5419b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -103,7 +103,6 @@ struct nvme_dev { u16 abort_limit; u8 event_limit; u8 vwc; - u8 initialized; }; /* From 9ac16938ab784101631c32c6de825f7ebea08a48 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 Jan 2015 16:52:08 -0700 Subject: [PATCH 6/9] NVMe: Fix scsi mode select llbaa setting It should be a logical bitwise AND, not conditional. Signed-off-by: Keith Busch --- drivers/block/nvme-scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 5cc907648742..e10196e0182d 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -1612,7 +1612,7 @@ static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, /* 10 Byte CDB */ *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + parm_list[MODE_SELECT_10_BD_OFFSET + 1]; - *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] && + *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & MODE_SELECT_10_LLBAA_MASK; } else { /* 6 Byte CDB */ From 483285184059b3f5b3a5707977349528abc82441 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 14:01:59 -0700 Subject: [PATCH 7/9] NVMe: Remove unused variables We don't track queues in a llist, subscribe to hot-cpu notifications, or internally retry commands. Delete the unused artifacts. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 0f388206b15b..cf2d8e3c93a8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -49,7 +49,6 @@ #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) #define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) -#define IOD_TIMEOUT (retry_time * HZ) static unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); @@ -59,10 +58,6 @@ unsigned char nvme_io_timeout = 30; module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); -static unsigned char retry_time = 30; -module_param(retry_time, byte, 0644); -MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); - static unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); @@ -81,7 +76,6 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; -static struct notifier_block nvme_nb; static struct class *nvme_class; @@ -102,7 +96,6 @@ struct async_cmd_info { * commands and one for I/O commands). */ struct nvme_queue { - struct llist_node node; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -3203,7 +3196,6 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); class_destroy(nvme_class); From 0c0f9b95c8b710b74772edd9693fe7ab5419a75a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 19 Feb 2015 14:29:48 -0700 Subject: [PATCH 8/9] NVMe: Fix potential corruption on sync commands This makes all sync commands uninterruptible and schedules without timeout so the controller either has to post a completion or the timeout recovery fails the command. This fixes potential memory or data corruption from a command timing out too early or woken by a signal. Previously any DMA buffers mapped for that command would have been released even though we don't know what the controller is planning to do with those addresses. Signed-off-by: Keith Busch --- drivers/block/nvme-core.c | 32 +++----------------------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cf2d8e3c93a8..b64bccbb78c9 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -926,14 +926,6 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info * - cmd_info) -{ - spin_lock_irq(&nvmeq->q_lock); - cancel_cmd_info(cmd_info, NULL); - spin_unlock_irq(&nvmeq->q_lock); -} - struct sync_cmd_info { struct task_struct *task; u32 result; @@ -956,7 +948,6 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, u32 *result, unsigned timeout) { - int ret; struct sync_cmd_info cmdinfo; struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = cmd_rq->nvmeq; @@ -968,29 +959,12 @@ static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, nvme_set_info(cmd_rq, &cmdinfo, sync_completion); - set_current_state(TASK_KILLABLE); - ret = nvme_submit_cmd(nvmeq, cmd); - if (ret) { - nvme_finish_cmd(nvmeq, req->tag, NULL); - set_current_state(TASK_RUNNING); - } - ret = schedule_timeout(timeout); - - /* - * Ensure that sync_completion has either run, or that it will - * never run. - */ - nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); - - /* - * We never got the completion - */ - if (cmdinfo.status == -EINTR) - return -EINTR; + set_current_state(TASK_UNINTERRUPTIBLE); + nvme_submit_cmd(nvmeq, cmd); + schedule(); if (result) *result = cmdinfo.result; - return cmdinfo.status; } From 045c47ca306acf30c740c285a77a4b4bda6be7c5 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 16 Feb 2015 17:16:45 -0200 Subject: [PATCH 9/9] blk-throttle: check stats_cpu before reading it from sysfs When reading blkio.throttle.io_serviced in a recently created blkio cgroup, it's possible to race against the creation of a throttle policy, which delays the allocation of stats_cpu. Like other functions in the throttle code, just checking for a NULL stats_cpu prevents the following oops caused by that race. [ 1117.285199] Unable to handle kernel paging request for data at address 0x7fb4d0020 [ 1117.285252] Faulting instruction address: 0xc0000000003efa2c [ 1137.733921] Oops: Kernel access of bad area, sig: 11 [#1] [ 1137.733945] SMP NR_CPUS=2048 NUMA PowerNV [ 1137.734025] Modules linked in: bridge stp llc kvm_hv kvm binfmt_misc autofs4 [ 1137.734102] CPU: 3 PID: 5302 Comm: blkcgroup Not tainted 3.19.0 #5 [ 1137.734132] task: c000000f1d188b00 ti: c000000f1d210000 task.ti: c000000f1d210000 [ 1137.734167] NIP: c0000000003efa2c LR: c0000000003ef9f0 CTR: c0000000003ef980 [ 1137.734202] REGS: c000000f1d213500 TRAP: 0300 Not tainted (3.19.0) [ 1137.734230] MSR: 9000000000009032 CR: 42008884 XER: 20000000 [ 1137.734325] CFAR: 0000000000008458 DAR: 00000007fb4d0020 DSISR: 40000000 SOFTE: 0 GPR00: c0000000003ed3a0 c000000f1d213780 c000000000c59538 0000000000000000 GPR04: 0000000000000800 0000000000000000 0000000000000000 0000000000000000 GPR08: ffffffffffffffff 00000007fb4d0020 00000007fb4d0000 c000000000780808 GPR12: 0000000022000888 c00000000fdc0d80 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 000001003e120200 c000000f1d5b0cc0 0000000000000200 0000000000000000 GPR24: 0000000000000001 c000000000c269e0 0000000000000020 c000000f1d5b0c80 GPR28: c000000000ca3a08 c000000000ca3dec c000000f1c667e00 c000000f1d213850 [ 1137.734886] NIP [c0000000003efa2c] .tg_prfill_cpu_rwstat+0xac/0x180 [ 1137.734915] LR [c0000000003ef9f0] .tg_prfill_cpu_rwstat+0x70/0x180 [ 1137.734943] Call Trace: [ 1137.734952] [c000000f1d213780] [d000000005560520] 0xd000000005560520 (unreliable) [ 1137.734996] [c000000f1d2138a0] [c0000000003ed3a0] .blkcg_print_blkgs+0xe0/0x1a0 [ 1137.735039] [c000000f1d213960] [c0000000003efb50] .tg_print_cpu_rwstat+0x50/0x70 [ 1137.735082] [c000000f1d2139e0] [c000000000104b48] .cgroup_seqfile_show+0x58/0x150 [ 1137.735125] [c000000f1d213a70] [c0000000002749dc] .kernfs_seq_show+0x3c/0x50 [ 1137.735161] [c000000f1d213ae0] [c000000000218630] .seq_read+0xe0/0x510 [ 1137.735197] [c000000f1d213bd0] [c000000000275b04] .kernfs_fop_read+0x164/0x200 [ 1137.735240] [c000000f1d213c80] [c0000000001eb8e0] .__vfs_read+0x30/0x80 [ 1137.735276] [c000000f1d213cf0] [c0000000001eb9c4] .vfs_read+0x94/0x1b0 [ 1137.735312] [c000000f1d213d90] [c0000000001ebb38] .SyS_read+0x58/0x100 [ 1137.735349] [c000000f1d213e30] [c000000000009218] syscall_exit+0x0/0x98 [ 1137.735383] Instruction dump: [ 1137.735405] 7c6307b4 7f891800 409d00b8 60000000 60420000 3d420004 392a63b0 786a1f24 [ 1137.735471] 7d49502a e93e01c8 7d495214 7d2ad214 <7cead02a> e9090008 e9490010 e9290018 And here is one code that allows to easily reproduce this, although this has first been found by running docker. void run(pid_t pid) { int n; int status; int fd; char *buffer; buffer = memalign(BUFFER_ALIGN, BUFFER_SIZE); n = snprintf(buffer, BUFFER_SIZE, "%d\n", pid); fd = open(CGPATH "/test/tasks", O_WRONLY); write(fd, buffer, n); close(fd); if (fork() > 0) { fd = open("/dev/sda", O_RDONLY | O_DIRECT); read(fd, buffer, 512); close(fd); wait(&status); } else { fd = open(CGPATH "/test/blkio.throttle.io_serviced", O_RDONLY); n = read(fd, buffer, BUFFER_SIZE); close(fd); } free(buffer); exit(0); } void test(void) { int status; mkdir(CGPATH "/test", 0666); if (fork() > 0) wait(&status); else run(getpid()); rmdir(CGPATH "/test"); } int main(int argc, char **argv) { int i; for (i = 0; i < NR_TESTS; i++) test(); return 0; } Reported-by: Ricardo Marin Matinata Signed-off-by: Thadeu Lima de Souza Cascardo Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-throttle.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9273d0969ebd..5b9c6d5c3636 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1292,6 +1292,9 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, struct blkg_rwstat rwstat = { }, tmp; int i, cpu; + if (tg->stats_cpu == NULL) + return 0; + for_each_possible_cpu(cpu) { struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);