From dd545b52a3e1efd9f2c6352dbe95ccd0c53461cc Mon Sep 17 00:00:00 2001
From: Chandan Rajendra
Date: Tue, 10 Jan 2017 13:29:54 -0700
Subject: [PATCH 01/12] do_direct_IO: Use inode->i_blkbits to compute block
 count to be cleaned

The code currently uses sdio->blkbits to compute the number of blocks to
be cleaned. However, sdio->blkbits is derived from the logical block
size of the underlying block device (refer to the definition of
do_blockdev_direct_IO()). Due to this, the generic/299 test would rarely
fail when executed on an ext4 filesystem with a 64k block size, using a
virtio based disk (with a 512 byte logical block size) inside a kvm
guest.

This commit fixes the bug by using inode->i_blkbits to compute the
number of blocks to be cleaned.

Signed-off-by: Chandan Rajendra
Reviewed-by: Christoph Hellwig

Fixed up by Jeff Moyer to only use/evaluate inode->i_blkbits once, to
avoid issues with block size changes with IO in flight.

Signed-off-by: Jens Axboe
---
 fs/direct-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index aeae8c063451..c87bae4376b8 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -906,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 			struct buffer_head *map_bh)
 {
 	const unsigned blkbits = sdio->blkbits;
+	const unsigned i_blkbits = blkbits + sdio->blkfactor;
 	int ret = 0;
 
 	while (sdio->block_in_file < sdio->final_block_in_request) {
@@ -949,7 +950,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 					clean_bdev_aliases(
 						map_bh->b_bdev,
 						map_bh->b_blocknr,
-						map_bh->b_size >> blkbits);
+						map_bh->b_size >> i_blkbits);
 				}
 
 				if (!sdio->blkfactor)

From a14d749fcebe97ddf6af6db3d1f6ece85c9ddcb9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 9 Jan 2017 08:56:23 -0700
Subject: [PATCH 02/12] virtio_blk: avoid DMA to stack for the sense buffer

Most users of BLOCK_PC requests allocate the sense buffer on the stack,
so to avoid DMA to the stack, copy it to a field in the heap-allocated
virtblk_req structure. Without that, any attempt at SCSI passthrough
I/O, including the SG_IO ioctl from userspace, will crash the kernel.
Note that this includes running tools like hdparm even when the host
does not have SCSI passthrough enabled.
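
To make the failure mode concrete, the unsafe pattern looks roughly like
this (an illustrative sketch, not code from this driver; the function
name is made up):

	/* BROKEN: hands a stack buffer to the DMA scatterlist. */
	static void example_issue_scsi_cmd(struct virtqueue *vq)
	{
		u8 sense[SCSI_SENSE_BUFFERSIZE];	/* on the kernel stack */
		struct scatterlist sg;

		/*
		 * sg_init_one() ends up doing virt_to_page() on the buffer.
		 * With virtually mapped stacks (CONFIG_VMAP_STACK) that
		 * translation is invalid for stack addresses, so the device
		 * DMAs into the wrong page and the kernel crashes.
		 */
		sg_init_one(&sg, sense, sizeof(sense));
	}

Staging the sense data in the heap-allocated virtblk_req first, as done
below, gives the device a buffer that is safe to map for DMA.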
Signed-off-by: Christoph Hellwig
Cc: stable@vger.kernel.org # v4.9+
Signed-off-by: Jens Axboe
---
 drivers/block/virtio_blk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 5545a679abd8..3c3b8f601469 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -56,6 +56,7 @@ struct virtblk_req {
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
 	u8 status;
+	u8 sense[SCSI_SENSE_BUFFERSIZE];
 	struct scatterlist sg[];
 };
 
@@ -102,7 +103,8 @@ static int __virtblk_add_req(struct virtqueue *vq,
 	}
 
 	if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
-		sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+		memcpy(vbr->sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+		sg_init_one(&sense, vbr->sense, SCSI_SENSE_BUFFERSIZE);
 		sgs[num_out + num_in++] = &sense;
 		sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
 		sgs[num_out + num_in++] = &inhdr;

From 25b4acfc7de0fc4da3bfea3a316f7282c6fbde81 Mon Sep 17 00:00:00 2001
From: Jeff Moyer
Date: Mon, 9 Jan 2017 15:20:31 -0500
Subject: [PATCH 03/12] nbd: blk_mq_init_queue returns an error code on
 failure, not NULL

Additionally, don't assign directly to disk->queue, otherwise
blk_put_queue (called via put_disk) will choke (panic) on the errno
stored there.

Bug found by code inspection after Omar found a similar issue in
virtio_blk. Compile-tested only.

Signed-off-by: Jeff Moyer
Reviewed-by: Omar Sandoval
Reviewed-by: Josef Bacik
Signed-off-by: Jens Axboe
---
 drivers/block/nbd.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 38c576f76d36..50a2020b5b72 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1042,6 +1042,7 @@ static int __init nbd_init(void)
 		return -ENOMEM;
 
 	for (i = 0; i < nbds_max; i++) {
+		struct request_queue *q;
 		struct gendisk *disk = alloc_disk(1 << part_shift);
 		if (!disk)
 			goto out;
@@ -1067,12 +1068,13 @@ static int __init nbd_init(void)
 		 * every gendisk to have its very own request_queue struct.
 		 * These structs are big so we dynamically allocate them.
 		 */
-		disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set);
-		if (!disk->queue) {
+		q = blk_mq_init_queue(&nbd_dev[i].tag_set);
+		if (IS_ERR(q)) {
 			blk_mq_free_tag_set(&nbd_dev[i].tag_set);
 			put_disk(disk);
 			goto out;
 		}
+		disk->queue = q;
 
 		/*
 		 * Tell the block layer that we are not a rotational device

From 6bf6b0aa3da84a3d9126919a94c49c0fb7ee2fb3 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Mon, 9 Jan 2017 11:44:12 -0800
Subject: [PATCH 04/12] virtio_blk: fix panic in initialization error path

If blk_mq_init_queue() returns an error, it gets assigned to
vblk->disk->queue. Then, when we call put_disk(), we end up calling
blk_put_queue() with the ERR_PTR, causing a bad dereference. Fix it by
only assigning to vblk->disk->queue on success.
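
Both this fix and the preceding nbd one converge on the same idiom;
roughly (a minimal sketch, with illustrative label names):

	struct request_queue *q;

	q = blk_mq_init_queue(&vblk->tag_set);
	if (IS_ERR(q)) {
		/* q holds an ERR_PTR-encoded errno, never NULL */
		err = PTR_ERR(q);
		goto out_free_tags;
	}
	/* only a valid queue may ever become visible to put_disk() */
	vblk->disk->queue = q;

The point is that disk->queue is written only after the IS_ERR() check,
so put_disk() can never see an ERR_PTR.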
Signed-off-by: Omar Sandoval
Reviewed-by: Jeff Moyer
Signed-off-by: Jens Axboe
---
 drivers/block/virtio_blk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 3c3b8f601469..10332c24f961 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -630,11 +630,12 @@ static int virtblk_probe(struct virtio_device *vdev)
 	if (err)
 		goto out_put_disk;
 
-	q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
+	q = blk_mq_init_queue(&vblk->tag_set);
 	if (IS_ERR(q)) {
 		err = -ENOMEM;
 		goto out_free_tags;
 	}
+	vblk->disk->queue = q;
 
 	q->queuedata = vblk;

From 1392370ee7de8aa3f69936f55bea6bfcc9879c59 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 3 Jan 2017 14:29:02 +0300
Subject: [PATCH 05/12] nvme-rdma: fix nvme_rdma_queue_is_ready

Now that we don't abuse the cmd field in struct request for nvme command
passthrough, this function needs to be converted to the proper accessor
as well.

Fixes: d49187e97e ("nvme: introduce struct nvme_request")
Signed-off-by: Christoph Hellwig
Reviewed-by: Max Gurtovoy
---
 drivers/nvme/host/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f587af345889..34e564857716 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1422,7 +1422,7 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
 		struct request *rq)
 {
 	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
-		struct nvme_command *cmd = (struct nvme_command *)rq->cmd;
+		struct nvme_command *cmd = nvme_req(rq)->cmd;
 
 		if (rq->cmd_type != REQ_TYPE_DRV_PRIV ||
 		    cmd->common.opcode != nvme_fabrics_command ||

From b5a10c5f7532b7473776da87e67f8301bbc32693 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli"
Date: Wed, 28 Dec 2016 22:13:15 -0200
Subject: [PATCH 06/12] nvme: apply DELAY_BEFORE_CHK_RDY quirk at probe time
 too

Commit 54adc01055b7 ("nvme/quirk: Add a delay before checking for
adapter readiness") introduced a quirk for adapters that cannot read the
NVME_CSTS_RDY bit right after the NVME_REG_CC register is set; these
adapters need a delay, or else the act of reading NVME_CSTS_RDY can
somehow corrupt the adapter's register state, from which it never
recovers.

When this quirk was added, we checked ctrl->tagset in order to avoid
applying the quirk at probe time, supposing we would never require such
a delay during probe. Well, that was too optimistic; we do in fact need
this quirk at probe time in some cases, like after a kexec.

In some experiments, after an abnormal shutdown of the machine (i.e. a
power cord unplug), we booted into our bootloader on Power, which is
itself a Linux kernel, and kexec'ed into another distro. If this kexec
is too quick, we end up reaching the probe of the NVMe adapter in that
distro while the adapter is in a bad state (not fully initialized by our
bootloader). What happens next is that nvme_wait_ready() is unable to
complete, unless the quirk is enabled.

So, this patch removes the original ctrl->tagset check in order to
enable the quirk at probe time as well.

Fixes: 54adc01055b7 ("nvme/quirk: Add a delay before checking for adapter readiness")
Reported-by: Andrew Byrne
Reported-by: Jaime A. H. Gomez
Reported-by: Zachary D. Myers
Signed-off-by: Guilherme G. Piccoli
Acked-by: Jeffrey Lien
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2fc86dc7a8df..8a3c3e32a704 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1106,12 +1106,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
 	if (ret)
 		return ret;
 
-	/* Checking for ctrl->tagset is a trick to avoid sleeping on module
-	 * load, since we only need the quirk on reset_controller. Notice
-	 * that the HGST device needs this delay only in firmware activation
-	 * procedure; unfortunately we have no (easy) way to verify this.
-	 */
-	if ((ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) && ctrl->tagset)
+	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
 		msleep(NVME_QUIRK_DELAY_AMOUNT);
 
 	return nvme_wait_ready(ctrl, cap, false);

From f99e86485cc32cd16e5cc97f9bb0474f28608d84 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 12 Jan 2017 07:58:32 -0700
Subject: [PATCH 07/12] block: Rename blk_queue_zone_size and bdev_zone_size

All block device data fields and functions returning a number of 512B
sectors are by convention named xxx_sectors, while names of the form
xxx_size are generally used for a number of bytes. The
blk_queue_zone_size and bdev_zone_size functions did not follow this
convention, so rename them.

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal

Collapsed the two patches; they were nonsensically split and broke
bisection.

Signed-off-by: Jens Axboe
---
 block/blk-zoned.c         |  4 ++--
 block/partition-generic.c | 14 +++++++-------
 fs/f2fs/segment.c         |  4 ++--
 fs/f2fs/super.c           |  6 +++---
 include/linux/blkdev.h    |  6 +++---
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 472211fa183a..3bd15d8095b1 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -16,7 +16,7 @@
 static inline sector_t blk_zone_start(struct request_queue *q,
 				      sector_t sector)
 {
-	sector_t zone_mask = blk_queue_zone_size(q) - 1;
+	sector_t zone_mask = blk_queue_zone_sectors(q) - 1;
 
 	return sector & ~zone_mask;
 }
@@ -222,7 +222,7 @@ int blkdev_reset_zones(struct block_device *bdev,
 		return -EINVAL;
 
 	/* Check alignment (handle eventual smaller last zone) */
-	zone_sectors = blk_queue_zone_size(q);
+	zone_sectors = blk_queue_zone_sectors(q);
 	if (sector & (zone_sectors - 1))
 		return -EINVAL;

diff --git a/block/partition-generic.c b/block/partition-generic.c
index d7beb6bbbf66..7afb9907821f 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -434,7 +434,7 @@ static bool part_zone_aligned(struct gendisk *disk,
 			      struct block_device *bdev,
 			      sector_t from, sector_t size)
 {
-	unsigned int zone_size = bdev_zone_size(bdev);
+	unsigned int zone_sectors = bdev_zone_sectors(bdev);
 
 	/*
 	 * If this function is called, then the disk is a zoned block device
@@ -446,7 +446,7 @@ static bool part_zone_aligned(struct gendisk *disk,
 	 * regular block devices (no zone operation) and their zone size will
 	 * be reported as 0. Allow this case.
 	 */
-	if (!zone_size)
+	if (!zone_sectors)
 		return true;
 
 	/*
@@ -455,24 +455,24 @@ static bool part_zone_aligned(struct gendisk *disk,
 	 * use it. Check the zone size too: it should be a power of 2 number
 	 * of sectors.
	 */
-	if (WARN_ON_ONCE(!is_power_of_2(zone_size))) {
+	if (WARN_ON_ONCE(!is_power_of_2(zone_sectors))) {
 		u32 rem;
 
-		div_u64_rem(from, zone_size, &rem);
+		div_u64_rem(from, zone_sectors, &rem);
 		if (rem)
 			return false;
 		if ((from + size) < get_capacity(disk)) {
-			div_u64_rem(size, zone_size, &rem);
+			div_u64_rem(size, zone_sectors, &rem);
 			if (rem)
 				return false;
 		}
 	} else {
-		if (from & (zone_size - 1))
+		if (from & (zone_sectors - 1))
 			return false;
 		if ((from + size) < get_capacity(disk) &&
-		    (size & (zone_size - 1)))
+		    (size & (zone_sectors - 1)))
 			return false;
 	}

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0738f48293cc..0d8802453758 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -713,8 +713,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
 	}
 
 	sector = SECTOR_FROM_BLOCK(blkstart);
-	if (sector & (bdev_zone_size(bdev) - 1) ||
-	    nr_sects != bdev_zone_size(bdev)) {
+	if (sector & (bdev_zone_sectors(bdev) - 1) ||
+	    nr_sects != bdev_zone_sectors(bdev)) {
 		f2fs_msg(sbi->sb, KERN_INFO,
 			"(%d) %s: Unaligned discard attempted (block %x + %x)",
 			devi, sbi->s_ndevs ? FDEV(devi).path: "",

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 702638e21c76..46fd30d8af77 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1553,16 +1553,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
 		return 0;
 
 	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
-				SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
+				SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)))
 		return -EINVAL;
-	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
+	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev));
 	if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
 				__ilog2_u32(sbi->blocks_per_blkz))
 		return -EINVAL;
 	sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
 	FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
 				sbi->log_blocks_per_blkz;
-	if (nr_sectors & (bdev_zone_size(bdev) - 1))
+	if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
 		FDEV(devi).nr_blkz++;
 
 	FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83695641bd5e..ff3d774f2751 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -739,7 +739,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 	}
 }
 
-static inline unsigned int blk_queue_zone_size(struct request_queue *q)
+static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
 {
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
@@ -1536,12 +1536,12 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
 	return false;
 }
 
-static inline unsigned int bdev_zone_size(struct block_device *bdev)
+static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 
 	if (q)
-		return blk_queue_zone_size(q);
+		return blk_queue_zone_sectors(q);
 	return 0;
 }

From 2e3258ecfaebace1ceffaa14e0ea94775d54f46f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 13 Jan 2017 12:29:10 +0100
Subject: [PATCH 08/12] block: add blk_rq_payload_bytes

Add a helper to calculate the actual data transfer size for special
payload requests.
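
For a request carrying a special payload, the two sizes differ; roughly
(an illustrative sketch with made-up values, not part of the patch):

	/*
	 * Example: a discard turned into a single-segment special payload.
	 * blk_rq_bytes() still covers the whole affected range, while the
	 * new helper reports only the bytes actually transferred.
	 */
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
		blk_rq_bytes(rq);		/* e.g. 1 MiB of discarded LBAs */
		blk_rq_payload_bytes(rq);	/* special_vec.bv_len, e.g. 16 bytes */
	}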
Signed-off-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ff3d774f2751..1ca8e8fd1078 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1000,6 +1000,19 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
+/*
+ * Some commands like WRITE SAME have a payload or data transfer size which
+ * is different from the size of the request.  Any driver that supports such
+ * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
+ * calculate the data transfer size.
+ */
+static inline unsigned int blk_rq_payload_bytes(struct request *rq)
+{
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return rq->special_vec.bv_len;
+	return blk_rq_bytes(rq);
+}
+
 static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 						     int op)
 {

From fd102b125e174edbea34e6e7a2d371bc7901c53d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 13 Jan 2017 12:29:11 +0100
Subject: [PATCH 09/12] scsi: use blk_rq_payload_bytes

Without that we'll pass a wrong payload size in cmd->sdb, which can lead
to hangs with drivers that need the total transfer size.

Signed-off-by: Christoph Hellwig
Reported-by: Chris Valean
Reported-by: Dexuan Cui
Fixes: f9d03f96 ("block: improve handling of the magic discard payload")
Reviewed-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 drivers/scsi/scsi_lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c35b6de4ca64..ad4ff8fcd4dd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1018,7 +1018,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
 	count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
 	BUG_ON(count > sdb->table.nents);
 	sdb->table.nents = count;
-	sdb->length = blk_rq_bytes(req);
+	sdb->length = blk_rq_payload_bytes(req);
 	return BLKPREP_OK;
 }

From b131c61d62266eb21b0f125f63f3d07e5670d726 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 13 Jan 2017 12:29:12 +0100
Subject: [PATCH 10/12] nvme: use blk_rq_payload_bytes

The new blk_rq_payload_bytes generalizes the payload length hacks that
nvme_map_len did before.

Signed-off-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/fc.c   |  5 ++---
 drivers/nvme/host/nvme.h |  8 --------
 drivers/nvme/host/pci.c  | 19 ++++++++-----------
 drivers/nvme/host/rdma.c | 13 +++++--------
 4 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index aa0bc60810a7..fcc9dcfdf675 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1654,13 +1654,12 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
 		struct nvme_fc_fcp_op *op)
 {
 	struct nvmefc_fcp_req *freq = &op->fcp_req;
-	u32 map_len = nvme_map_len(rq);
 	enum dma_data_direction dir;
 	int ret;
 
 	freq->sg_cnt = 0;
 
-	if (!map_len)
+	if (!blk_rq_payload_bytes(rq))
 		return 0;
 
 	freq->sg_table.sgl = freq->first_sgl;
@@ -1854,7 +1853,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ret)
 		return ret;
 
-	data_len = nvme_map_len(rq);
+	data_len = blk_rq_payload_bytes(rq);
 	if (data_len)
 		io_dir = ((rq_data_dir(rq) == WRITE) ?
				NVMEFC_FCP_WRITE : NVMEFC_FCP_READ);

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 6377e14586dc..aead6d08ed2c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -225,14 +225,6 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
 	return (sector >> (ns->lba_shift - 9));
 }
 
-static inline unsigned nvme_map_len(struct request *rq)
-{
-	if (req_op(rq) == REQ_OP_DISCARD)
-		return sizeof(struct nvme_dsm_range);
-	else
-		return blk_rq_bytes(rq);
-}
-
 static inline void nvme_cleanup_cmd(struct request *req)
 {
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 19beeb7b2ac2..3faefabf339c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -306,11 +306,11 @@ static __le64 **iod_list(struct request *req)
 	return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req));
 }
 
-static int nvme_init_iod(struct request *rq, unsigned size,
-		struct nvme_dev *dev)
+static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
 	int nseg = blk_rq_nr_phys_segments(rq);
+	unsigned int size = blk_rq_payload_bytes(rq);
 
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
 		iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg),
				GFP_ATOMIC);
@@ -420,12 +420,11 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
 }
 #endif
 
-static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req,
-		int total_len)
+static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct dma_pool *pool;
-	int length = total_len;
+	int length = blk_rq_payload_bytes(req);
 	struct scatterlist *sg = iod->sg;
 	int dma_len = sg_dma_len(sg);
 	u64 dma_addr = sg_dma_address(sg);
@@ -501,7 +500,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req,
 }
 
 static int nvme_map_data(struct nvme_dev *dev, struct request *req,
-		unsigned size, struct nvme_command *cmnd)
+		struct nvme_command *cmnd)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct request_queue *q = req->q;
@@ -519,7 +518,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req,
 			DMA_ATTR_NO_WARN))
 		goto out;
 
-	if (!nvme_setup_prps(dev, req, size))
+	if (!nvme_setup_prps(dev, req))
 		goto out_unmap;
 
 	ret = BLK_MQ_RQ_QUEUE_ERROR;
@@ -580,7 +579,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *req = bd->rq;
 	struct nvme_command cmnd;
-	unsigned map_len;
 	int ret = BLK_MQ_RQ_QUEUE_OK;
 
 	/*
@@ -600,13 +598,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		return ret;
 
-	map_len = nvme_map_len(req);
-	ret = nvme_init_iod(req, map_len, dev);
+	ret = nvme_init_iod(req, dev);
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		goto out_free_cmd;
 
 	if (blk_rq_nr_phys_segments(req))
-		ret = nvme_map_data(dev, req, map_len, &cmnd);
+		ret = nvme_map_data(dev, req, &cmnd);
 
 	if (ret != BLK_MQ_RQ_QUEUE_OK)
 		goto out_cleanup_iod;

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 34e564857716..557f29b1f1bb 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -981,8 +981,7 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 }
 
 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
-		struct request *rq, unsigned int map_len,
-		struct nvme_command *c)
+		struct request *rq, struct nvme_command *c)
 {
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_rdma_device *dev = queue->device;
@@ -1014,9 +1013,9 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 	}
 
 	if (count == 1) {
-		if (rq_data_dir(rq) == WRITE &&
-		    map_len <= nvme_rdma_inline_data_size(queue) &&
-		    nvme_rdma_queue_idx(queue))
+		if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
+		    blk_rq_payload_bytes(rq) <=
+				nvme_rdma_inline_data_size(queue))
 			return nvme_rdma_map_sg_inline(queue, req, c);
 
 		if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
@@ -1444,7 +1443,6 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_command *c = sqe->data;
 	bool flush = false;
 	struct ib_device *dev;
-	unsigned int map_len;
 	int ret;
 
 	WARN_ON_ONCE(rq->tag < 0);
@@ -1462,8 +1460,7 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(rq);
 
-	map_len = nvme_map_len(rq);
-	ret = nvme_rdma_map_data(queue, rq, map_len, c);
+	ret = nvme_rdma_map_data(queue, rq, c);
 	if (ret < 0) {
 		dev_err(queue->ctrl->ctrl.device,
 			"Failed to map data (%d)\n", ret);

From f80de881d8df967488b7343381619efa15019493 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 13 Jan 2017 12:29:13 +0100
Subject: [PATCH 11/12] sd: remove __data_len hack for WRITE SAME

Now that we have the blk_rq_payload_bytes helper available to determine
the actual I/O size, we don't need to mess around with __data_len for
WRITE SAME.

Signed-off-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Reviewed-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 drivers/scsi/sd.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index b1933041da39..1fbb1ecf49f2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -836,7 +836,6 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 	struct bio *bio = rq->bio;
 	sector_t sector = blk_rq_pos(rq);
 	unsigned int nr_sectors = blk_rq_sectors(rq);
-	unsigned int nr_bytes = blk_rq_bytes(rq);
 	int ret;
 
 	if (sdkp->device->no_write_same)
@@ -869,21 +868,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 	cmd->transfersize = sdp->sector_size;
 	cmd->allowed = SD_MAX_RETRIES;
 
-	/*
-	 * For WRITE_SAME the data transferred in the DATA IN buffer is
-	 * different from the amount of data actually written to the target.
-	 *
-	 * We set up __data_len to the amount of data transferred from the
-	 * DATA IN buffer so that blk_rq_map_sg set up the proper S/G list
-	 * to transfer a single sector of data first, but then reset it to
-	 * the amount of data to be written right after so that the I/O path
-	 * knows how much to actually write.
-	 */
-	rq->__data_len = sdp->sector_size;
-	ret = scsi_init_io(cmd);
-	rq->__data_len = nr_bytes;
-	return ret;
+	return scsi_init_io(cmd);
 }
 
 static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd)

From bef13315e990fd3d3fb4c39013aefd53f06c3657 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 13 Jan 2017 15:18:16 -0700
Subject: [PATCH 12/12] block: don't try to discard from
 __blkdev_issue_zeroout

Discard can return -EIO asynchronously if the alignment for the request
isn't suitable for the driver, which makes a proper fallback to other
methods in __blkdev_issue_zeroout impossible. Thus only issue a sync
discard from blkdev_issue_zeroout, and don't try discard at all from
__blkdev_issue_zeroout, as a non-invasive workaround.

One more reason why abusing discard for zeroing must die...
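
The resulting call flow is roughly the following (a condensed sketch of
blkdev_issue_zeroout() after this patch, not the diff itself):

	if (discard) {
		/*
		 * Synchronous discard: its return value is known before we
		 * decide anything, so a failure (e.g. -EIO for bad alignment)
		 * simply falls through to explicit zeroing.
		 */
		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
				BLKDEV_DISCARD_ZERO))
			return 0;
	}

	/* fall back to write zeroes and, failing that, zero-filled writes */

The async path in __blkdev_issue_zeroout() cannot do this, because a
discard bio's -EIO only shows up at bio completion, after the function
has already returned.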
Signed-off-by: Christoph Hellwig
Reported-by: Eryu Guan
Fixes: e73c23ff ("block: add async variant of blkdev_issue_zeroout")
Signed-off-by: Jens Axboe
---
 block/blk-lib.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index ed89c8f4b2a0..f8c82a9b4012 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -301,13 +301,6 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	if ((sector | nr_sects) & bs_mask)
 		return -EINVAL;
 
-	if (discard) {
-		ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
-				BLKDEV_DISCARD_ZERO, biop);
-		if (ret == 0 || (ret && ret != -EOPNOTSUPP))
-			goto out;
-	}
-
 	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
 			biop);
 	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
@@ -370,6 +363,12 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 
+	if (discard) {
+		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
+				BLKDEV_DISCARD_ZERO))
+			return 0;
+	}
+
 	blk_start_plug(&plug);
 	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
 			&bio, discard);
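
For reference, a typical caller after this series looks roughly like
this (a hypothetical callsite, not taken from the series):

	/* Zero a range, letting the block layer try discard first. */
	int err = blkdev_issue_zeroout(bdev, start_sector, nr_sects,
				       GFP_KERNEL, true);
	if (err)
		pr_warn("zeroout of %llu sectors failed: %d\n",
			(unsigned long long)nr_sects, err);

If the device's discard implementation cannot zero the range, the call
now transparently falls back to write zeroes or plain zero-filled writes
instead of surfacing a spurious -EIO.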