From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:15:18 -0600 Subject: [PATCH 01/24] bio: skip atomic inc/dec of ->bi_remaining for non-chains Struct bio has an atomic ref count for chained bio's, and we use this to know when to end IO on the bio. However, most bio's are not chained, so we don't need to always introduce this atomic operation as part of ending IO. Add a helper to elevate the bi_remaining count, and flag the bio as now actually needing the decrement at end_io time. Rename the field to __bi_remaining to catch any current users of this doing the incrementing manually. For high IOPS workloads, this reduces the overhead of bio_endio() substantially. Tested-by: Robert Elliott Acked-by: Kent Overstreet Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- block/bio.c | 38 +++++++++++++++++++++++++++--------- drivers/md/dm-cache-target.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-snap.c | 2 +- drivers/md/dm-thin.c | 4 ++-- include/linux/bio.h | 11 +++++++++++ include/linux/blk_types.h | 3 ++- 7 files changed, 47 insertions(+), 15 deletions(-) diff --git a/block/bio.c b/block/bio.c index f66a4eae16ee..117da319afb6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,7 +270,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); + atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -292,8 +292,8 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); + bio->bi_flags = flags | (1 << BIO_UPTODATE); + atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); @@ -320,7 +320,7 @@ void bio_chain(struct bio *bio, struct bio *parent) bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); + bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); @@ -1741,6 +1741,23 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. + */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) + return true; + + return false; +} + /** * bio_endio - end I/O on a bio * @bio: bio @@ -1758,15 +1775,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages); void bio_endio(struct bio *bio, int error) { while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - if (error) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; + if (unlikely(!bio_remaining_done(bio))) + break; /* * Need to have a real endio function for chained bios, @@ -1799,7 +1814,12 @@ EXPORT_SYMBOL(bio_endio); **/ void bio_endio_nodec(struct bio *bio, int error) { - atomic_inc(&bio->bi_remaining); + /* + * If it's not flagged as a chain, we are not going to dec the count + */ + if (bio_flagged(bio, BIO_CHAIN)) + bio_inc_remaining(bio); + bio_endio(bio, error); } EXPORT_SYMBOL(bio_endio_nodec); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 7755af351867..705eb7b99d69 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -91,7 +91,7 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) * Must bump bi_remaining to allow bio to complete with * restored bi_end_io. */ - atomic_inc(&bio->bi_remaining); + bio_inc_remaining(bio); } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 089d62751f7f..d6a1c096b777 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1254,7 +1254,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) dm_bio_restore(bd, bio); bio_record->details.bi_bdev = NULL; - atomic_inc(&bio->bi_remaining); + bio_inc_remaining(bio); queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f83a0f3fc365..8bfeae218531 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1478,7 +1478,7 @@ out: if (full_bio) { full_bio->bi_end_io = pe->full_bio_end_io; full_bio->bi_private = pe->full_bio_private; - atomic_inc(&full_bio->bi_remaining); + bio_inc_remaining(full_bio); } increment_pending_exceptions_done_count(); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 921aafd12aee..342dbdad6131 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -795,7 +795,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { if (m->bio) { m->bio->bi_end_io = m->saved_bi_end_io; - atomic_inc(&m->bio->bi_remaining); + bio_inc_remaining(m->bio); } cell_error(m->tc->pool, m->cell); list_del(&m->list); @@ -812,7 +812,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) bio = m->bio; if (bio) { bio->bi_end_io = m->saved_bi_end_io; - atomic_inc(&bio->bi_remaining); + bio_inc_remaining(bio); } if (m->err) { diff --git a/include/linux/bio.h b/include/linux/bio.h index da3a127c9958..8bfe9eee6d1a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -644,6 +644,17 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a1b25e35ea5f..8b07e0603887 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -65,7 +65,7 @@ struct bio { unsigned int bi_seg_front_size; unsigned int bi_seg_back_size; - atomic_t bi_remaining; + atomic_t __bi_remaining; bio_end_io_t *bi_end_io; @@ -122,6 +122,7 @@ struct bio { #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ +#define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ /* * Flags starting here get preserved by bio_reset() - this includes From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 17 Apr 2015 16:23:59 -0600 Subject: [PATCH 02/24] bio: skip atomic inc/dec of ->bi_cnt for most use cases Struct bio has a reference count that controls when it can be freed. Most uses cases is allocating the bio, which then returns with a single reference to it, doing IO, and then dropping that single reference. We can remove this atomic_dec_and_test() in the completion path, if nobody else is holding a reference to the bio. If someone does call bio_get() on the bio, then we flag the bio as now having valid count and that we must properly honor the reference count when it's being put. Tested-by: Robert Elliott Signed-off-by: Jens Axboe --- block/bio.c | 18 +++++++++++------- drivers/md/bcache/request.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/xfs/xfs_aops.c | 1 - include/linux/bio.h | 16 +++++++++++++++- include/linux/blk_types.h | 3 ++- 6 files changed, 30 insertions(+), 12 deletions(-) diff --git a/block/bio.c b/block/bio.c index 117da319afb6..c2ff8a88aef1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -271,7 +271,7 @@ void bio_init(struct bio *bio) memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; atomic_set(&bio->__bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); + atomic_set(&bio->__bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -524,13 +524,17 @@ EXPORT_SYMBOL(zero_fill_bio); **/ void bio_put(struct bio *bio) { - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) + if (!bio_flagged(bio, BIO_REFFED)) bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } } EXPORT_SYMBOL(bio_put); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index ab43faddb447..1616f668a4cb 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -619,7 +619,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio) bio->bi_end_io = request_endio; bio->bi_private = &s->cl; - atomic_set(&bio->bi_cnt, 3); + bio_cnt_set(bio, 3); } static void search_free(struct closure *cl) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96aebf3bcd5b..8e8d1d1e28a5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -345,7 +345,7 @@ loop_lock: waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->bi_cnt) == 0); + BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* * if we're doing the sync list, record that our diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a56960dd1684..095f94c2d8b5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -356,7 +356,6 @@ xfs_end_bio( { xfs_ioend_t *ioend = bio->bi_private; - ASSERT(atomic_read(&bio->bi_cnt) >= 1); ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; /* Toss bio and pass work off to an xfsdatad thread */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 8bfe9eee6d1a..7486ea103f6e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -290,7 +290,21 @@ static inline unsigned bio_segments(struct bio *bio) * returns. and then bio would be freed memory when if (bio->bi_flags ...) * runs */ -#define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +static inline void bio_get(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_cnt); +} + +static inline void bio_cnt_set(struct bio *bio, unsigned int count) +{ + if (count != 1) { + bio->bi_flags |= (1 << BIO_REFFED); + smp_mb__before_atomic(); + } + atomic_set(&bio->__bi_cnt, count); +} enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8b07e0603887..93d2e7153816 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -92,7 +92,7 @@ struct bio { unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ - atomic_t bi_cnt; /* pin count */ + atomic_t __bi_cnt; /* pin count */ struct bio_vec *bi_io_vec; /* the actual vec list */ @@ -123,6 +123,7 @@ struct bio { #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ #define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes From 4f8c9510ba71bb54477841bebb90154ef140860f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:16 +0200 Subject: [PATCH 03/24] block: rename REQ_TYPE_SPECIAL to REQ_TYPE_DRV_PRIV Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- drivers/block/paride/pd.c | 4 ++-- drivers/block/sx8.c | 4 ++-- drivers/block/virtio_blk.c | 6 +++--- drivers/ide/ide-atapi.c | 4 ++-- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-cd_ioctl.c | 2 +- drivers/ide/ide-devsets.c | 2 +- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-floppy.c | 6 +++--- drivers/ide/ide-io.c | 4 ++-- drivers/ide/ide-ioctls.c | 2 +- drivers/ide/ide-park.c | 4 ++-- drivers/ide/ide-tape.c | 4 ++-- include/linux/blkdev.h | 4 ++-- 15 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 39e5f7fae3ef..9cf52ac328fe 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -592,7 +592,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, fsync_bdev(bdev); mutex_lock(&nbd->tx_lock); blk_rq_init(NULL, &sreq); - sreq.cmd_type = REQ_TYPE_SPECIAL; + sreq.cmd_type = REQ_TYPE_DRV_PRIV; nbd_cmd(&sreq) = NBD_CMD_DISC; /* Check again after getting mutex back. */ diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index d48715b287e6..dbb4da1cdca8 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -442,7 +442,7 @@ static char *pd_buf; /* buffer for request in progress */ static enum action do_pd_io_start(void) { - if (pd_req->cmd_type == REQ_TYPE_SPECIAL) { + if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) { phase = pd_special; return pd_special(); } @@ -725,7 +725,7 @@ static int pd_special_command(struct pd_unit *disk, if (IS_ERR(rq)) return PTR_ERR(rq); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = func; err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 5d552857de41..59c91d49b14b 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -620,7 +620,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); @@ -661,7 +661,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5ea2f0bbbc7c..d4d05f064d39 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -124,7 +124,7 @@ static inline void virtblk_request_done(struct request *req) req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); - } else if (req->cmd_type == REQ_TYPE_SPECIAL) { + } else if (req->cmd_type == REQ_TYPE_DRV_PRIV) { req->errors = (error != 0); } @@ -188,7 +188,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; - case REQ_TYPE_SPECIAL: + case REQ_TYPE_DRV_PRIV: vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); @@ -251,7 +251,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) return PTR_ERR(req); } - req->cmd_type = REQ_TYPE_SPECIAL; + req->cmd_type = REQ_TYPE_DRV_PRIV; err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); blk_put_request(req); diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index fac3d9da2e07..b367300ce479 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -93,7 +93,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, int error; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = (char *)pc; if (buf && bufflen) { @@ -477,7 +477,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (uptodate == 0) drive->failed_pc = NULL; - if (rq->cmd_type == REQ_TYPE_SPECIAL) { + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) { rq->errors = 0; error = 0; } else { diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 0b510bafd90e..9a32603c6c74 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -799,7 +799,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cdrom_do_block_pc(drive, rq); break; - case REQ_TYPE_SPECIAL: + case REQ_TYPE_DRV_PRIV: /* right now this can only be a reset... */ uptodate = 1; goto out_end; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 02caa7dd51c8..066e39036518 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) int ret; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_flags = REQ_QUIET; ret = blk_execute_rq(drive->queue, cd->disk, rq, 0); blk_put_request(rq); diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index 9e98122f646e..b05a74d78ef5 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, return setting->set(drive, arg); rq = blk_get_request(q, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 5; rq->cmd[0] = REQ_DEVSET_EXEC; *(int *)&rq->cmd[1] = arg; diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 32970664c275..19d809c48a8d 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -147,7 +147,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) { struct request *rq = drive->hwif->rq; - if (rq && rq->cmd_type == REQ_TYPE_SPECIAL && + if (rq && rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->cmd[0] == REQ_DRIVE_RESET) { if (err <= 0 && rq->errors == 0) rq->errors = -EIO; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 8c6363cdd208..3dbfa5b6db6a 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -97,7 +97,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc) "Aborting request!\n"); } - if (rq->cmd_type == REQ_TYPE_SPECIAL) + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL; return uptodate; @@ -246,7 +246,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, } else printk(KERN_ERR PFX "%s: I/O error\n", drive->name); - if (rq->cmd_type == REQ_TYPE_SPECIAL) { + if (rq->cmd_type == REQ_TYPE_DRV_PRIV) { rq->errors = 0; ide_complete_rq(drive, 0, blk_rq_bytes(rq)); return ide_stopped; @@ -265,7 +265,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); break; - case REQ_TYPE_SPECIAL: + case REQ_TYPE_DRV_PRIV: case REQ_TYPE_SENSE: pc = (struct ide_atapi_pc *)rq->special; break; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 177db6d5b2f5..8e55abd03a24 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -135,7 +135,7 @@ EXPORT_SYMBOL(ide_complete_rq); void ide_kill_rq(ide_drive_t *drive, struct request *rq) { - u8 drv_req = (rq->cmd_type == REQ_TYPE_SPECIAL) && rq->rq_disk; + u8 drv_req = (rq->cmd_type == REQ_TYPE_DRV_PRIV) && rq->rq_disk; u8 media = drive->media; drive->failed_pc = NULL; @@ -353,7 +353,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) pm->pm_step == IDE_PM_COMPLETED) ide_complete_pm_rq(drive, rq); return startstop; - } else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_SPECIAL) + } else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_DRV_PRIV) /* * TODO: Once all ULDs have been modified to * check for specific op codes rather than diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 6233fa2cb8a9..aa2e9b77b20d 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive) int ret = 0; rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd_len = 1; rq->cmd[0] = REQ_DRIVE_RESET; if (blk_execute_rq(drive->queue, NULL, rq, 1)) diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index ca958604cda2..c80868520488 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -34,7 +34,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) rq = blk_get_request(q, READ, __GFP_WAIT); rq->cmd[0] = REQ_PARK_HEADS; rq->cmd_len = 1; - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = &timeout; rc = blk_execute_rq(q, NULL, rq, 1); blk_put_request(rq); @@ -51,7 +51,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) rq->cmd[0] = REQ_UNPARK_HEADS; rq->cmd_len = 1; - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); out: diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 6eb738ca6d2f..2a5d543db9f5 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -576,7 +576,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, rq->cmd[0], (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq)); - BUG_ON(!(rq->cmd_type == REQ_TYPE_SPECIAL || + BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV || rq->cmd_type == REQ_TYPE_SENSE)); /* Retry a failed packet command */ @@ -853,7 +853,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) BUG_ON(size < 0 || size % tape->blk_size); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; rq->__sector = tape->first_frame; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7f9a516f24de..98c90272443b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,10 +79,10 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_SPECIAL, /* driver defined type */ + REQ_TYPE_DRV_PRIV, /* driver defined type */ /* * for ATA/ATAPI devices. this really doesn't belong here, ide should - * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver + * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver * private REQ_LB opcodes to differentiate what type of request this is */ REQ_TYPE_ATA_TASKFILE, From b42171ef7d938a66fa52e66a3d911ed63770b5ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:17 +0200 Subject: [PATCH 04/24] block: move REQ_TYPE_ATA_TASKFILE and REQ_TYPE_ATA_PC to ide.h These values are only used by the IDE driver, so move them into it by allowing drivers to take cmd_type values after the first private one. Note that we have to turn cmd_type into a plain unsigned integer so that gcc doesn't complain about mismatching enum types. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ++--------- include/linux/ide.h | 7 +++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 98c90272443b..9cb4d80a4987 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,14 +79,7 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ - REQ_TYPE_DRV_PRIV, /* driver defined type */ - /* - * for ATA/ATAPI devices. this really doesn't belong here, ide should - * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver - * private REQ_LB opcodes to differentiate what type of request this is - */ - REQ_TYPE_ATA_TASKFILE, - REQ_TYPE_ATA_PC, + REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; #define BLK_MAX_CDB 16 @@ -108,7 +101,7 @@ struct request { struct blk_mq_ctx *mq_ctx; u64 cmd_flags; - enum rq_cmd_type_bits cmd_type; + unsigned cmd_type; unsigned long atomic_flags; int cpu; diff --git a/include/linux/ide.h b/include/linux/ide.h index 93b5ca754b5b..62ac399144a6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -39,6 +39,12 @@ struct device; +/* IDE-specific values for req->cmd_type */ +enum ata_cmd_type_bits { + REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, + REQ_TYPE_ATA_PC, +}; + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1551,4 +1557,5 @@ static inline void ide_set_drivedata(ide_drive_t *drive, void *data) #define ide_host_for_each_port(i, port, host) \ for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++) + #endif /* _IDE_H */ From b0b93b48a30e809240ddd7449a6ad60a5ddf7b4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:18 +0200 Subject: [PATCH 05/24] block: move REQ_TYPE_SENSE to the ide driver Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/ide/ide-atapi.c | 6 +++--- drivers/ide/ide-cd.c | 8 ++++---- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-tape.c | 2 +- include/linux/blkdev.h | 1 - include/linux/ide.h | 1 + 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index b367300ce479..1362ad80a76c 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -191,7 +191,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) BUG_ON(sense_len > sizeof(*sense)); - if (rq->cmd_type == REQ_TYPE_SENSE || drive->sense_rq_armed) + if (rq->cmd_type == REQ_TYPE_ATA_SENSE || drive->sense_rq_armed) return; memset(sense, 0, sizeof(*sense)); @@ -210,7 +210,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) sense_rq->rq_disk = rq->rq_disk; sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; sense_rq->cmd[4] = cmd_len; - sense_rq->cmd_type = REQ_TYPE_SENSE; + sense_rq->cmd_type = REQ_TYPE_ATA_SENSE; sense_rq->cmd_flags |= REQ_PREEMPT; if (drive->media == ide_tape) @@ -310,7 +310,7 @@ int ide_cd_get_xferlen(struct request *rq) switch (rq->cmd_type) { case REQ_TYPE_FS: return 32768; - case REQ_TYPE_SENSE: + case REQ_TYPE_ATA_SENSE: case REQ_TYPE_BLOCK_PC: case REQ_TYPE_ATA_PC: return blk_rq_bytes(rq); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 9a32603c6c74..64a6b827b3dd 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -210,7 +210,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* - * For REQ_TYPE_SENSE, "rq->special" points to the original + * For REQ_TYPE_ATA_SENSE, "rq->special" points to the original * failed request. Also, the sense data should be read * directly from rq which might be different from the original * sense buffer if it got copied during mapping. @@ -285,7 +285,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) "stat 0x%x", rq->cmd[0], rq->cmd_type, err, stat); - if (rq->cmd_type == REQ_TYPE_SENSE) { + if (rq->cmd_type == REQ_TYPE_ATA_SENSE) { /* * We got an error trying to get sense info from the drive * (probably while trying to recover from a former error). @@ -526,7 +526,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) ide_expiry_t *expiry = NULL; int dma_error = 0, dma, thislen, uptodate = 0; int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0; - int sense = (rq->cmd_type == REQ_TYPE_SENSE); + int sense = (rq->cmd_type == REQ_TYPE_ATA_SENSE); unsigned int timeout; u16 len; u8 ireason, stat; @@ -791,7 +791,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, if (cdrom_start_rw(drive, rq) == ide_stopped) goto out_end; break; - case REQ_TYPE_SENSE: + case REQ_TYPE_ATA_SENSE: case REQ_TYPE_BLOCK_PC: case REQ_TYPE_ATA_PC: if (!rq->timeout) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 3dbfa5b6db6a..2fb5350c5410 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -266,7 +266,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); break; case REQ_TYPE_DRV_PRIV: - case REQ_TYPE_SENSE: + case REQ_TYPE_ATA_SENSE: pc = (struct ide_atapi_pc *)rq->special; break; case REQ_TYPE_BLOCK_PC: diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2a5d543db9f5..f5d51d1d09ee 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -577,7 +577,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, blk_rq_sectors(rq)); BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV || - rq->cmd_type == REQ_TYPE_SENSE)); + rq->cmd_type == REQ_TYPE_ATA_SENSE)); /* Retry a failed packet command */ if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cb4d80a4987..6076b9e18dcb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -75,7 +75,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_SENSE, /* sense request */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 62ac399144a6..9856b7d455d9 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -43,6 +43,7 @@ struct device; enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, + REQ_TYPE_ATA_SENSE, /* sense request */ }; /* Error codes returned in rq->errors to the higher part of the driver. */ From ac7cdff00a33d48d27217560fa3b16d802e5f535 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:19 +0200 Subject: [PATCH 06/24] block: remove REQ_TYPE_PM_SHUTDOWN Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6076b9e18dcb..c2829ba5e738 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -77,7 +77,6 @@ enum rq_cmd_type_bits { REQ_TYPE_BLOCK_PC, /* scsi command */ REQ_TYPE_PM_SUSPEND, /* suspend request */ REQ_TYPE_PM_RESUME, /* resume request */ - REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; From a7928c1578c550bd6f4dec62d65132e6db226c57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:20 +0200 Subject: [PATCH 07/24] block: move PM request support to IDE This removes the request types and hacks from the block code and into the old IDE driver. There is a small amunt of code duplication due to this, but it's not too bad. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-exec.c | 10 ------- block/blk.h | 2 -- drivers/ide/ide-eh.c | 2 +- drivers/ide/ide-io.c | 8 +++--- drivers/ide/ide-pm.c | 56 +++++++++++++++++++++++++++++--------- drivers/ide/ide-taskfile.c | 2 +- include/linux/blkdev.h | 21 +------------- include/linux/ide.h | 19 +++++++++++++ 9 files changed, 70 insertions(+), 51 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index fd154b94447a..2e5020f37d55 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -285,6 +285,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue diff --git a/block/blk-exec.c b/block/blk-exec.c index 9924725fa50d..3fec8a29d0fa 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - bool is_pm_resume; WARN_ON(irqs_disabled()); WARN_ON(rq->cmd_type == REQ_TYPE_FS); @@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, return; } - /* - * need to check this before __blk_run_queue(), because rq can - * be freed before that returns. - */ - is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { @@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be run */ - if (is_pm_resume) - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/block/blk.h b/block/blk.h index 43b036185712..4b48d55e588e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -193,8 +193,6 @@ int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -void __blk_run_queue_uncond(struct request_queue *q); - int blk_dev_init(void); diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index 19d809c48a8d..d6da011299f5 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -129,7 +129,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) if (cmd) ide_complete_cmd(drive, cmd, stat, err); - } else if (blk_pm_request(rq)) { + } else if (ata_pm_request(rq)) { rq->errors = 1; ide_complete_pm_rq(drive, rq); return ide_stopped; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 8e55abd03a24..669ea1e45795 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -320,7 +320,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) goto kill_rq; } - if (blk_pm_request(rq)) + if (ata_pm_request(rq)) ide_check_pm_state(drive, rq); drive->hwif->tp_ops->dev_select(drive); @@ -342,8 +342,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) return execute_drive_cmd(drive, rq); - else if (blk_pm_request(rq)) { - struct request_pm_state *pm = rq->special; + else if (ata_pm_request(rq)) { + struct ide_pm_state *pm = rq->special; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, pm->pm_step); @@ -538,7 +538,7 @@ repeat: * state machine. */ if ((drive->dev_flags & IDE_DFLAG_BLOCKED) && - blk_pm_request(rq) == 0 && + ata_pm_request(rq) == 0 && (rq->cmd_flags & REQ_PREEMPT) == 0) { /* there should be no pending command at this point */ ide_unlock_port(hwif); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 8d1e32d7cd97..081e43458d50 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -8,7 +8,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_drive_t *pair = ide_get_pair_dev(drive); ide_hwif_t *hwif = drive->hwif; struct request *rq; - struct request_pm_state rqpm; + struct ide_pm_state rqpm; int ret; if (ide_port_acpi(hwif)) { @@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_PM_SUSPEND; + rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) @@ -38,13 +38,43 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) return ret; } +static void ide_end_sync_rq(struct request *rq, int error) +{ + complete(rq->end_io_data); +} + +static int ide_pm_execute_rq(struct request *rq) +{ + struct request_queue *q = rq->q; + DECLARE_COMPLETION_ONSTACK(wait); + + rq->end_io_data = &wait; + rq->end_io = ide_end_sync_rq; + + spin_lock_irq(q->queue_lock); + if (unlikely(blk_queue_dying(q))) { + rq->cmd_flags |= REQ_QUIET; + rq->errors = -ENXIO; + __blk_end_request_all(rq, rq->errors); + spin_unlock_irq(q->queue_lock); + return -ENXIO; + } + __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT); + __blk_run_queue_uncond(q); + spin_unlock_irq(q->queue_lock); + + wait_for_completion_io(&wait); + + return rq->errors ? -EIO : 0; +} + int generic_ide_resume(struct device *dev) { ide_drive_t *drive = to_ide_device(dev); ide_drive_t *pair = ide_get_pair_dev(drive); ide_hwif_t *hwif = drive->hwif; struct request *rq; - struct request_pm_state rqpm; + struct ide_pm_state rqpm; int err; if (ide_port_acpi(hwif)) { @@ -59,13 +89,13 @@ int generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); - rq->cmd_type = REQ_TYPE_PM_RESUME; + rq->cmd_type = REQ_TYPE_ATA_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; - err = blk_execute_rq(drive->queue, NULL, rq, 1); + err = ide_pm_execute_rq(rq); blk_put_request(rq); if (err == 0 && dev->driver) { @@ -80,7 +110,7 @@ int generic_ide_resume(struct device *dev) void ide_complete_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; #ifdef DEBUG_PM printk(KERN_INFO "%s: complete_power_step(step: %d)\n", @@ -110,7 +140,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; struct ide_cmd cmd = { }; switch (pm->pm_step) { @@ -182,7 +212,7 @@ out_do_tf: void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; unsigned long flags; ide_complete_power_step(drive, rq); @@ -191,10 +221,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) #ifdef DEBUG_PM printk("%s: completing PM request, %s\n", drive->name, - (rq->cmd_type == REQ_TYPE_PM_SUSPEND) ? "suspend" : "resume"); + (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) ? "suspend" : "resume"); #endif spin_lock_irqsave(q->queue_lock, flags); - if (rq->cmd_type == REQ_TYPE_PM_SUSPEND) + if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) blk_stop_queue(q); else drive->dev_flags &= ~IDE_DFLAG_BLOCKED; @@ -208,13 +238,13 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) void ide_check_pm_state(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->special; + struct ide_pm_state *pm = rq->special; - if (rq->cmd_type == REQ_TYPE_PM_SUSPEND && + if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND && pm->pm_step == IDE_PM_START_SUSPEND) /* Mark drive blocked when starting the suspend sequence. */ drive->dev_flags |= IDE_DFLAG_BLOCKED; - else if (rq->cmd_type == REQ_TYPE_PM_RESUME && + else if (rq->cmd_type == REQ_TYPE_ATA_PM_RESUME && pm->pm_step == IDE_PM_START_RESUME) { /* * The first thing we do on wakeup is to wait for BSY bit to diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index dabb88b1cbec..0979e126fff1 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -186,7 +186,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive) tf->command == ATA_CMD_CHK_POWER) { struct request *rq = hwif->rq; - if (blk_pm_request(rq)) + if (ata_pm_request(rq)) ide_complete_pm_rq(drive, rq); else ide_finish_cmd(drive, cmd, stat); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c2829ba5e738..2da818a48097 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -30,7 +30,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -struct request_pm_state; struct blk_trace; struct request; struct sg_io_hdr; @@ -75,8 +74,6 @@ struct request_list { enum rq_cmd_type_bits { REQ_TYPE_FS = 1, /* fs request */ REQ_TYPE_BLOCK_PC, /* scsi command */ - REQ_TYPE_PM_SUSPEND, /* suspend request */ - REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; @@ -207,19 +204,6 @@ static inline unsigned short req_get_ioprio(struct request *req) return req->ioprio; } -/* - * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME - * requests. Some step values could eventually be made generic. - */ -struct request_pm_state -{ - /* PM state machine step value, currently driver specific */ - int pm_step; - /* requested PM state value (S1, S2, S3, S4, ...) */ - u32 pm_state; - void* data; /* for driver use */ -}; - #include struct blk_queue_ctx; @@ -601,10 +585,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) (((rq)->cmd_flags & REQ_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) -#define blk_pm_request(rq) \ - ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \ - (rq)->cmd_type == REQ_TYPE_PM_RESUME) - #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) /* rq->queuelist of dequeued request must be list_empty() */ @@ -838,6 +818,7 @@ extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *q); +extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, diff --git a/include/linux/ide.h b/include/linux/ide.h index 9856b7d455d9..a633898f36ac 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -44,8 +44,14 @@ enum ata_cmd_type_bits { REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1, REQ_TYPE_ATA_PC, REQ_TYPE_ATA_SENSE, /* sense request */ + REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */ + REQ_TYPE_ATA_PM_RESUME, /* resume request */ }; +#define ata_pm_request(rq) \ + ((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \ + (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME) + /* Error codes returned in rq->errors to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, @@ -1321,6 +1327,19 @@ struct ide_port_info { u8 udma_mask; }; +/* + * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME + * requests. + */ +struct ide_pm_state { + /* PM state machine step value, currently driver specific */ + int pm_step; + /* requested PM state value (S1, S2, S3, S4, ...) */ + u32 pm_state; + void* data; /* for driver use */ +}; + + int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *); int ide_pci_init_two(struct pci_dev *, struct pci_dev *, const struct ide_port_info *, void *); From 9dc6c806b3c4812619e305685b3c86835bf784ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2015 22:37:21 +0200 Subject: [PATCH 08/24] nbd: stop using req->cmd Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 48 +++++++++++++++++++--------------------- include/uapi/linux/nbd.h | 2 -- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 9cf52ac328fe..83a7ba4a3eec 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -230,29 +230,40 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) int result, flags; struct nbd_request request; unsigned long size = blk_rq_bytes(req); + u32 type; + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + type = NBD_CMD_DISC; + else if (req->cmd_flags & REQ_DISCARD) + type = NBD_CMD_TRIM; + else if (req->cmd_flags & REQ_FLUSH) + type = NBD_CMD_FLUSH; + else if (rq_data_dir(req) == WRITE) + type = NBD_CMD_WRITE; + else + type = NBD_CMD_READ; memset(&request, 0, sizeof(request)); request.magic = htonl(NBD_REQUEST_MAGIC); - request.type = htonl(nbd_cmd(req)); - - if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) { + request.type = htonl(type); + if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } memcpy(request.handle, &req, sizeof(req)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", - req, nbdcmd_to_ascii(nbd_cmd(req)), + req, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, 1, &request, sizeof(request), - (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); + (type == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); return -EIO; } - if (nbd_cmd(req) == NBD_CMD_WRITE) { + if (type == NBD_CMD_WRITE) { struct req_iterator iter; struct bio_vec bvec; /* @@ -352,7 +363,7 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) } dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); - if (nbd_cmd(req) == NBD_CMD_READ) { + if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; @@ -452,23 +463,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) if (req->cmd_type != REQ_TYPE_FS) goto error_out; - nbd_cmd(req) = NBD_CMD_READ; - if (rq_data_dir(req) == WRITE) { - if ((req->cmd_flags & REQ_DISCARD)) { - WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM)); - nbd_cmd(req) = NBD_CMD_TRIM; - } else - nbd_cmd(req) = NBD_CMD_WRITE; - if (nbd->flags & NBD_FLAG_READ_ONLY) { - dev_err(disk_to_dev(nbd->disk), - "Write on read-only\n"); - goto error_out; - } - } - - if (req->cmd_flags & REQ_FLUSH) { - BUG_ON(unlikely(blk_rq_sectors(req))); - nbd_cmd(req) = NBD_CMD_FLUSH; + if (rq_data_dir(req) == WRITE && + (nbd->flags & NBD_FLAG_READ_ONLY)) { + dev_err(disk_to_dev(nbd->disk), + "Write on read-only\n"); + goto error_out; } req->errors = 0; @@ -593,7 +592,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, mutex_lock(&nbd->tx_lock); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_DRV_PRIV; - nbd_cmd(&sreq) = NBD_CMD_DISC; /* Check again after getting mutex back. */ if (!nbd->sock) diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 4f52549b23ff..e08e413d5f71 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -44,8 +44,6 @@ enum { /* there is a gap here to match userspace */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ -#define nbd_cmd(req) ((req)->cmd[0]) - /* userspace doesn't need the nbd_device structure */ /* These are sent over the network in the request/reply magic fields */ From dd6cf3e18decb4895503db1752bb5500c4dd588d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:28 -0700 Subject: [PATCH 09/24] blk: clean up plug Current code looks like inner plug gets flushed with a blk_finish_plug(). Actually it's a nop. All requests/callbacks are added to current->plug, while only outmost plug is assigned to current->plug. So inner plug always has empty request/callback list, which makes blk_flush_plug_list() a nop. This tries to make the code more clear. Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2e5020f37d55..9dcfb8ec554b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3032,21 +3032,20 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); - /* - * If this is a nested plug, don't actually assign it. It will be - * flushed on its own. + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier */ - if (!tsk->plug) { - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; - } + tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -3193,10 +3192,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { + if (plug != current->plug) + return; blk_flush_plug_list(plug, false); - if (plug == current->plug) - current->plug = NULL; + current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); From 5596d0d591bea25424c07f0fce00df5af593b31f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:29 -0700 Subject: [PATCH 10/24] sched: always use blk_schedule_flush_plug in io_schedule_out block plug callback could sleep, so we introduce a parameter 'from_schedule' and corresponding drivers can use it to destinguish a schedule plug flush or a plug finish. Unfortunately io_schedule_out still uses blk_flush_plug(). This causes below output (Note, I added a might_sleep() in raid1_unplug to make it trigger faster, but the whole thing doesn't matter if I add might_sleep). In raid1/10, this can cause deadlock. This patch makes io_schedule_out always uses blk_schedule_flush_plug. This should only impact drivers (as far as I know, raid 1/10) which are sensitive to the 'from_schedule' parameter. [ 370.817949] ------------[ cut here ]------------ [ 370.817960] WARNING: CPU: 7 PID: 145 at ../kernel/sched/core.c:7306 __might_sleep+0x7f/0x90() [ 370.817969] do not call blocking ops when !TASK_RUNNING; state=2 set at [] prepare_to_wait+0x2f/0x90 [ 370.817971] Modules linked in: raid1 [ 370.817976] CPU: 7 PID: 145 Comm: kworker/u16:9 Tainted: G W 4.0.0+ #361 [ 370.817977] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153802- 04/01/2014 [ 370.817983] Workqueue: writeback bdi_writeback_workfn (flush-9:1) [ 370.817985] ffffffff81cd83be ffff8800ba8cb298 ffffffff819dd7af 0000000000000001 [ 370.817988] ffff8800ba8cb2e8 ffff8800ba8cb2d8 ffffffff81051afc ffff8800ba8cb2c8 [ 370.817990] ffffffffa00061a8 000000000000041e 0000000000000000 ffff8800ba8cba28 [ 370.817993] Call Trace: [ 370.817999] [] dump_stack+0x4f/0x7b [ 370.818002] [] warn_slowpath_common+0x8c/0xd0 [ 370.818004] [] warn_slowpath_fmt+0x46/0x50 [ 370.818006] [] ? prepare_to_wait+0x2f/0x90 [ 370.818008] [] ? prepare_to_wait+0x2f/0x90 [ 370.818010] [] __might_sleep+0x7f/0x90 [ 370.818014] [] raid1_unplug+0xd3/0x170 [raid1] [ 370.818024] [] blk_flush_plug_list+0x8a/0x1e0 [ 370.818028] [] ? bit_wait+0x50/0x50 [ 370.818031] [] io_schedule_timeout+0x130/0x140 [ 370.818033] [] bit_wait_io+0x36/0x50 [ 370.818034] [] __wait_on_bit+0x65/0x90 [ 370.818041] [] ? ext4_read_block_bitmap_nowait+0xbc/0x630 [ 370.818043] [] ? bit_wait+0x50/0x50 [ 370.818045] [] out_of_line_wait_on_bit+0x72/0x80 [ 370.818047] [] ? autoremove_wake_function+0x40/0x40 [ 370.818050] [] __wait_on_buffer+0x44/0x50 [ 370.818053] [] ext4_wait_block_bitmap+0xe0/0xf0 [ 370.818058] [] ext4_mb_init_cache+0x206/0x790 [ 370.818062] [] ? lru_cache_add+0x1c/0x50 [ 370.818064] [] ext4_mb_init_group+0x11e/0x200 [ 370.818066] [] ext4_mb_load_buddy+0x341/0x360 [ 370.818068] [] ext4_mb_find_by_goal+0x93/0x2f0 [ 370.818070] [] ? ext4_mb_normalize_request+0x1e4/0x5b0 [ 370.818072] [] ext4_mb_regular_allocator+0x67/0x460 [ 370.818074] [] ? ext4_mb_normalize_request+0x1e4/0x5b0 [ 370.818076] [] ext4_mb_new_blocks+0x4cb/0x620 [ 370.818079] [] ext4_ext_map_blocks+0x4c6/0x14d0 [ 370.818081] [] ? ext4_es_lookup_extent+0x4e/0x290 [ 370.818085] [] ext4_map_blocks+0x14d/0x4f0 [ 370.818088] [] ext4_writepages+0x76d/0xe50 [ 370.818094] [] do_writepages+0x21/0x50 [ 370.818097] [] __writeback_single_inode+0x60/0x490 [ 370.818099] [] writeback_sb_inodes+0x2da/0x590 [ 370.818103] [] ? trylock_super+0x1b/0x50 [ 370.818105] [] ? trylock_super+0x1b/0x50 [ 370.818107] [] __writeback_inodes_wb+0x9f/0xd0 [ 370.818109] [] wb_writeback+0x34b/0x3c0 [ 370.818111] [] bdi_writeback_workfn+0x23f/0x550 [ 370.818116] [] process_one_work+0x1c8/0x570 [ 370.818117] [] ? process_one_work+0x14b/0x570 [ 370.818119] [] worker_thread+0x11b/0x470 [ 370.818121] [] ? process_one_work+0x570/0x570 [ 370.818124] [] kthread+0xf8/0x110 [ 370.818126] [] ? kthread_create_on_node+0x210/0x210 [ 370.818129] [] ret_from_fork+0x42/0x70 [ 370.818131] [] ? kthread_create_on_node+0x210/0x210 [ 370.818132] ---[ end trace 7b4deb71e68b6605 ]--- V2: don't change ->in_iowait Cc: NeilBrown Signed-off-by: Shaohua Li Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/sched/core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fe22f7510bce..cfeebb499e79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4387,10 +4387,7 @@ long __sched io_schedule_timeout(long timeout) long ret; current->in_iowait = 1; - if (old_iowait) - blk_schedule_flush_plug(current); - else - blk_flush_plug(current); + blk_schedule_flush_plug(current); delayacct_blkio_start(); rq = raw_rq(); From e6c4438ba7cb615448492849970aaf0aaa1cc973 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 8 May 2015 10:51:30 -0700 Subject: [PATCH 11/24] blk-mq: fix plugging in blk_sq_make_request The following appears in blk_sq_make_request: /* * If we have multiple hardware queues, just go directly to * one of those for sync IO. */ We clearly don't have multiple hardware queues, here! This comment was introduced with this commit 07068d5b8e (blk-mq: split make request handler for multi and single queue): We want slightly different behavior from them: - On single queue devices, we currently use the per-process plug for deferred IO and for merging. - On multi queue devices, we don't use the per-process plug, but we want to go straight to hardware for SYNC IO. The old code had this: use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); and that was converted to: use_plug = !is_flush_fua && !is_sync; which is not equivalent. For the single queue case, that second half of the && expression is always true. So, what I think was actually inteded follows (and this more closely matches what is done in blk_queue_bio). V2: delete the 'likely', which should not be a big deal Signed-off-by: Jeff Moyer Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ade8a2d1b0aa..a65acffde19a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1309,16 +1309,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && !is_sync; - blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { @@ -1326,7 +1321,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && !blk_queue_nomerges(q) && + if (!is_flush_fua && !blk_queue_nomerges(q) && blk_attempt_plug_merge(q, bio, &request_count)) return; @@ -1345,21 +1340,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. */ - if (use_plug) { - struct blk_plug *plug = current->plug; - - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(data.ctx); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { From 239ad215f0d8388cbe6c09a0fab8ad8ff5dba420 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:31 -0700 Subject: [PATCH 12/24] blk-mq: avoid re-initialize request which is failed in direct dispatch If we directly issue a request and it fails, we use blk_mq_merge_queue_io(). But we already assigned bio to a request in blk_mq_bio_to_request. blk_mq_merge_queue_io shouldn't run blk_mq_bio_to_request again. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index a65acffde19a..f13d0de42f53 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1284,6 +1284,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_end_request(rq, rq->errors); goto done; } + blk_mq_insert_request(rq, false, true, true); + return; } } From f984df1f0f71ef96254411fc3576a10ae561be71 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:32 -0700 Subject: [PATCH 13/24] blk-mq: do limited block plug for multiple queue case plug is still helpful for workload with IO merge, but it can be harmful otherwise especially with multiple hardware queues, as there is (supposed) no lock contention in this case and plug can introduce latency. For multiple queues, we do limited plug, eg plug only if there is request merge. If a request doesn't have merge with following request, the requet will be dispatched immediately. V2: check blk_queue_nomerges() as suggested by Jeff. Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 82 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 23 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f13d0de42f53..902c2eb9a0e7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1224,6 +1224,38 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) + return 0; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + return -1; + } +} + /* * Multiple hardware queue variant. This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to @@ -1235,6 +1267,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; blk_queue_bounce(q, &bio); @@ -1243,6 +1277,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count)) + return; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) return; @@ -1253,40 +1291,39 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * we do limited pluging. If bio can be merged, do merge. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + if (!list_empty(&plug->mq_list)) { + old_rq = list_first_entry(&plug->mq_list, + struct request, queuelist); + list_del_init(&old_rq->queuelist); } - blk_mq_insert_request(rq, false, true, true); + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) return; - } + if (!blk_mq_direct_issue_request(old_rq)) + return; + blk_mq_insert_request(old_rq, false, true, true); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1299,7 +1336,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); } From 5b3f341f098d60da2970758db6a05bd851eb6b39 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 May 2015 10:51:33 -0700 Subject: [PATCH 14/24] blk-mq: make plug work for mutiple disks and queues Last patch makes plug work for multiple queue case. However it only works for single disk case, because it assumes only one request in the plug list. If a task is accessing multiple disks, eg MD/DM, the assumption is wrong. Let blk_attempt_plug_merge() record request from the same queue. V2: use NULL parameter in !mq case. Fix a bug. Add comments in blk_attempt_plug_merge to make it less (hopefully) confusion. Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 15 ++++++++++++--- block/blk-mq.c | 14 +++++++++----- block/blk.h | 3 ++- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9dcfb8ec554b..f0be754c7781 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1522,7 +1522,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; @@ -1542,8 +1543,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1608,7 +1617,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * any locks. */ if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; spin_lock_irq(q->queue_lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index 902c2eb9a0e7..31df47443699 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1269,6 +1269,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) struct request *rq; unsigned int request_count = 0; struct blk_plug *plug; + struct request *same_queue_rq = NULL; blk_queue_bounce(q, &bio); @@ -1278,7 +1279,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) } if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return; rq = blk_mq_map_request(q, bio, &data); @@ -1309,9 +1310,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) * issued. So the plug list will have one request at most */ if (plug) { - if (!list_empty(&plug->mq_list)) { - old_rq = list_first_entry(&plug->mq_list, - struct request, queuelist); + /* + * The plug list might get flushed before this. If that + * happens, same_queue_rq is invalid and plug list is empty + **/ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; list_del_init(&old_rq->queuelist); } list_add_tail(&rq->queuelist, &plug->mq_list); @@ -1360,7 +1364,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) } if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; rq = blk_mq_map_request(q, bio, &data); diff --git a/block/blk.h b/block/blk.h index 4b48d55e588e..026d9594142b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count); + unsigned int *request_count, + struct request **same_queue_rq); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2015 09:38:13 +0200 Subject: [PATCH 15/24] block: use an atomic_t for mq_freeze_depth lockdep gets unhappy about the not disabling irqs when using the queue_lock around it. Instead of trying to fix that up just switch to an atomic_t and get rid of the lock. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 ++++++++++-------------- include/linux/blkdev.h | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 31df47443699..c382a34fe5ac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; if (ret) @@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; + int freeze_depth; - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); - - if (freeze) { + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_hw_queues(q, false); } @@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { percpu_ref_reinit(&q->mq_usage_counter); wake_up_all(&q->mq_freeze_wq); } @@ -2081,7 +2077,7 @@ void blk_mq_free_queue(struct request_queue *q) /* Basically redo blk_mq_init_queue with queue frozen */ static void blk_mq_queue_reinit(struct request_queue *q) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2da818a48097..bc917956a6d0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -444,7 +444,7 @@ struct request_queue { struct mutex sysfs_lock; int bypass_depth; - int mq_freeze_depth; + atomic_t mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; From b25de9d6da49b1a8760a89672283128aa8c78345 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:01 +0200 Subject: [PATCH 16/24] block: remove BIO_EOPNOTSUPP Since the big barrier rewrite/removal in 2007 we never fail FLUSH or FUA requests, which means we can remove the magic BIO_EOPNOTSUPP flag to help propagating those to the buffer_head layer. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bounce.c | 3 --- fs/btrfs/disk-io.c | 11 ++--------- fs/btrfs/extent_io.c | 2 -- fs/buffer.c | 10 ---------- fs/ext4/page-io.c | 1 - fs/nilfs2/segbuf.c | 12 ------------ include/linux/blk_types.h | 1 - 7 files changed, 2 insertions(+), 38 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index ab21ba203d5c..4bac72579c1f 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -128,9 +128,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) struct bio_vec *bvec, *org_vec; int i; - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - /* * free up bounce indirect pages used */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2ef9a4b72d06..e08a926fe12c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3269,11 +3269,8 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3301,11 +3298,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("BTRFS: disabling barriers on dev %s\n", - rcu_str_deref(device->name)); - device->nobarriers = 1; - } else if (!bio_flagged(bio, BIO_UPTODATE)) { + if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43af5a61ad25..1e155299abc0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, else btrfsic_submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; bio_put(bio); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index c7a5602d01ee..efd85e0e8660 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2938,10 +2938,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) { struct buffer_head *bh = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - } - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) set_bit(BH_Quiet, &bh->b_state); @@ -3041,13 +3037,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) if (buffer_prio(bh)) rw |= REQ_PRIO; - bio_get(bio); submit_bio(rw, bio); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - - bio_put(bio); return ret; } EXPORT_SYMBOL_GPL(_submit_bh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5765f88b3904..c5d81e8d84c3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -359,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { bio_get(io->io_bio); submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); bio_put(io->io_bio); } io->io_bio = NULL; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc3a9efdaab8..42468e5ab3e7 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err) const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - /* to be detected by nilfs_segbuf_submit_bio() */ - } - if (!uptodate) atomic_inc(&segbuf->sb_err); @@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - bio_get(bio); submit_bio(mode, bio); segbuf->sb_nbio++; - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - bio_put(bio); - err = -EOPNOTSUPP; - goto failed; - } - bio_put(bio); wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 93d2e7153816..daf95915d104 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -118,7 +118,6 @@ struct bio { #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ -#define BIO_EOPNOTSUPP 7 /* not supported */ #define BIO_NULL_MAPPED 8 /* contains invalid user pages */ #define BIO_QUIET 9 /* Make BIO Quiet */ #define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ From 97ca223c3b37ed12a5b67a5dc6247e5a4799d337 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2015 21:41:02 +0200 Subject: [PATCH 17/24] block: remove unused BIO_RW_BLOCK and BIO_EOF flags Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 -- include/linux/blk_types.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f0be754c7781..de474b5dee2b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1725,8 +1725,6 @@ static void handle_bad_sector(struct bio *bio) bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index daf95915d104..09c7a2cd48ef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -112,8 +112,6 @@ struct bio { * bio flags */ #define BIO_UPTODATE 0 /* ok after I/O completion */ -#define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ -#define BIO_EOF 2 /* out-out-bounds error */ #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ #define BIO_CLONED 4 /* doesn't own data */ #define BIO_BOUNCED 5 /* bio is a bounce bio */ From b2dbe0a60f1bcf4db5c701f1577b3135c3159eb5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 May 2015 09:18:28 -0600 Subject: [PATCH 18/24] block: collapse bio bit space Various previous patches removed bits and left holes, collapse them all. Leave the reset start bit where it is, we don't need to change that. Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09c7a2cd48ef..3f4ded0b1a34 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -112,15 +112,15 @@ struct bio { * bio flags */ #define BIO_UPTODATE 0 /* ok after I/O completion */ -#define BIO_SEG_VALID 3 /* bi_phys_segments valid */ -#define BIO_CLONED 4 /* doesn't own data */ -#define BIO_BOUNCED 5 /* bio is a bounce bio */ -#define BIO_USER_MAPPED 6 /* contains user pages */ -#define BIO_NULL_MAPPED 8 /* contains invalid user pages */ -#define BIO_QUIET 9 /* Make BIO Quiet */ -#define BIO_SNAP_STABLE 10 /* bio data must be snapshotted during write */ -#define BIO_CHAIN 11 /* chained bio, ->bi_remaining in effect */ -#define BIO_REFFED 12 /* bio has elevated ->bi_cnt */ +#define BIO_SEG_VALID 1 /* bi_phys_segments valid */ +#define BIO_CLONED 2 /* doesn't own data */ +#define BIO_BOUNCED 3 /* bio is a bounce bio */ +#define BIO_USER_MAPPED 4 /* contains user pages */ +#define BIO_NULL_MAPPED 5 /* contains invalid user pages */ +#define BIO_QUIET 6 /* Make BIO Quiet */ +#define BIO_SNAP_STABLE 7 /* bio data must be snapshotted during write */ +#define BIO_CHAIN 8 /* chained bio, ->bi_remaining in effect */ +#define BIO_REFFED 9 /* bio has elevated ->bi_cnt */ /* * Flags starting here get preserved by bio_reset() - this includes From 343df3c79c62b644ce6ff5dff96c9e0be1ecb242 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2015 09:23:23 +0200 Subject: [PATCH 19/24] suspend: simplify block I/O handling Stop abusing struct page functionality and the swap end_io handler, and instead add a modified version of the blk-lib.c bio_batch helpers. Also move the block I/O code into swap.c as they are directly tied into each other. Signed-off-by: Christoph Hellwig Tested-by: Pavel Machek Tested-by: Ming Lin Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- include/linux/swap.h | 1 - kernel/power/Makefile | 3 +- kernel/power/block_io.c | 103 -------------------------- kernel/power/power.h | 9 --- kernel/power/swap.c | 159 ++++++++++++++++++++++++++++++---------- mm/page_io.c | 2 +- 6 files changed, 122 insertions(+), 155 deletions(-) delete mode 100644 kernel/power/block_io.c diff --git a/include/linux/swap.h b/include/linux/swap.h index cee108cbe2d5..38874729dc5f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -377,7 +377,6 @@ extern void end_swap_bio_write(struct bio *bio, int err); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)); extern int swap_set_page_dirty(struct page *page); -extern void end_swap_bio_read(struct bio *bio, int err); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block); diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bff11ef..cb880a14cc39 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -7,8 +7,7 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ - block_io.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c deleted file mode 100644 index 9a58bc258810..000000000000 --- a/kernel/power/block_io.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * This file provides functions for block I/O operations on swap/file. - * - * Copyright (C) 1998,2001-2005 Pavel Machek - * Copyright (C) 2006 Rafael J. Wysocki - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include - -#include "power.h" - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, struct block_device *bdev, sector_t sector, - struct page *page, struct bio **bio_chain) -{ - const int bio_rw = rw | REQ_SYNC; - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", - (unsigned long long)sector); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), - virt_to_page(addr), bio_chain); -} - -int hib_wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} diff --git a/kernel/power/power.h b/kernel/power/power.h index ce9b8328a689..caadb566e82b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -163,15 +163,6 @@ extern void swsusp_close(fmode_t); extern int swsusp_unmark(void); #endif -/* kernel/power/block_io.c */ -extern struct block_device *hib_resume_bdev; - -extern int hib_bio_read_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_bio_write_page(pgoff_t page_off, void *addr, - struct bio **bio_chain); -extern int hib_wait_on_bio_chain(struct bio **bio_chain); - struct timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 570aff817543..2f30ca91e4fa 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -212,7 +212,84 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -struct block_device *hib_resume_bdev; +static struct block_device *hib_resume_bdev; + +struct hib_bio_batch { + atomic_t count; + wait_queue_head_t wait; + int error; +}; + +static void hib_init_batch(struct hib_bio_batch *hb) +{ + atomic_set(&hb->count, 0); + init_waitqueue_head(&hb->wait); + hb->error = 0; +} + +static void hib_end_io(struct bio *bio, int error) +{ + struct hib_bio_batch *hb = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate || error) { + printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); + + if (!error) + error = -EIO; + } + + if (bio_data_dir(bio) == WRITE) + put_page(page); + + if (error && !hb->error) + hb->error = error; + if (atomic_dec_and_test(&hb->count)) + wake_up(&hb->wait); + + bio_put(bio); +} + +static int hib_submit_io(int rw, pgoff_t page_off, void *addr, + struct hib_bio_batch *hb) +{ + struct page *page = virt_to_page(addr); + struct bio *bio; + int error = 0; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = hib_resume_bdev; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)bio->bi_iter.bi_sector); + bio_put(bio); + return -EFAULT; + } + + if (hb) { + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(rw, bio); + } else { + error = submit_bio_wait(rw, bio); + bio_put(bio); + } + + return error; +} + +static int hib_wait_io(struct hib_bio_batch *hb) +{ + wait_event(hb->wait, atomic_read(&hb->count) == 0); + return hb->error; +} /* * Saving part @@ -222,7 +299,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -231,7 +308,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -271,10 +348,10 @@ static int swsusp_swap_check(void) * write_page - Write one page to given swap location. * @buf: Address we're writing. * @offset: Offset of the swap page we're writing to. - * @bio_chain: Link the next write BIO here + * @hb: bio completion batch */ -static int write_page(void *buf, sector_t offset, struct bio **bio_chain) +static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { void *src; int ret; @@ -282,13 +359,13 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (!offset) return -ENOSPC; - if (bio_chain) { + if (hb) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { - ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; src = (void *)__get_free_page(__GFP_WAIT | @@ -298,14 +375,14 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) copy_page(src, buf); } else { WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ + hb = NULL; /* Go synchronous */ src = buf; } } } else { src = buf; } - return hib_bio_write_page(offset, src, bio_chain); + return hib_submit_io(WRITE_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -348,7 +425,7 @@ err_close: } static int swap_write_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { int error = 0; sector_t offset; @@ -356,7 +433,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!handle->cur) return -EINVAL; offset = alloc_swapdev_block(root_swap); - error = write_page(buf, offset, bio_chain); + error = write_page(buf, offset, hb); if (error) return error; handle->cur->entries[handle->k++] = offset; @@ -365,15 +442,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, bio_chain); + error = write_page(handle->cur, handle->cur_swap, hb); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); + if (hb && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_io(hb); if (error) goto out; /* @@ -445,23 +522,24 @@ static int save_image(struct swap_map_handle *handle, int ret; int nr_pages; int err2; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; + hib_init_batch(&hb); + printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) break; - ret = swap_write_page(handle, data_of(*snapshot), &bio); + ret = swap_write_page(handle, data_of(*snapshot), &hb); if (ret) break; if (!(nr_pages % m)) @@ -469,7 +547,7 @@ static int save_image(struct swap_map_handle *handle, nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -580,7 +658,7 @@ static int save_image_lzo(struct swap_map_handle *handle, int ret = 0; int nr_pages; int err2; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; size_t off; @@ -589,6 +667,8 @@ static int save_image_lzo(struct swap_map_handle *handle, struct cmp_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + /* * We'll limit the number of threads for compression to limit memory * footprint. @@ -674,7 +754,6 @@ static int save_image_lzo(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { @@ -748,7 +827,7 @@ static int save_image_lzo(struct swap_map_handle *handle, off += PAGE_SIZE) { memcpy(page, data[thr].cmp + off, PAGE_SIZE); - ret = swap_write_page(handle, page, &bio); + ret = swap_write_page(handle, page, &hb); if (ret) goto out_finish; } @@ -759,7 +838,7 @@ static int save_image_lzo(struct swap_map_handle *handle, } out_finish: - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -906,7 +985,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_bio_read_page(offset, tmp->map, NULL); + error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -919,7 +998,7 @@ static int get_swap_reader(struct swap_map_handle *handle, } static int swap_read_page(struct swap_map_handle *handle, void *buf, - struct bio **bio_chain) + struct hib_bio_batch *hb) { sector_t offset; int error; @@ -930,7 +1009,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_bio_read_page(offset, buf, bio_chain); + error = hib_submit_io(READ_SYNC, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -968,27 +1047,28 @@ static int load_image(struct swap_map_handle *handle, int ret = 0; ktime_t start; ktime_t stop; - struct bio *bio; + struct hib_bio_batch hb; int err2; unsigned nr_pages; + hib_init_batch(&hb); + printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) break; - ret = swap_read_page(handle, data_of(*snapshot), &bio); + ret = swap_read_page(handle, data_of(*snapshot), &hb); if (ret) break; if (snapshot->sync_read) - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) break; if (!(nr_pages % m)) @@ -996,7 +1076,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages / m * 10); nr_pages++; } - err2 = hib_wait_on_bio_chain(&bio); + err2 = hib_wait_io(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -1067,7 +1147,7 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned int m; int ret = 0; int eof = 0; - struct bio *bio; + struct hib_bio_batch hb; ktime_t start; ktime_t stop; unsigned nr_pages; @@ -1080,6 +1160,8 @@ static int load_image_lzo(struct swap_map_handle *handle, struct dec_data *data = NULL; struct crc_data *crc = NULL; + hib_init_batch(&hb); + /* * We'll limit the number of threads for decompression to limit memory * footprint. @@ -1190,7 +1272,6 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; - bio = NULL; start = ktime_get(); ret = snapshot_write_next(snapshot); @@ -1199,7 +1280,7 @@ static int load_image_lzo(struct swap_map_handle *handle, for(;;) { for (i = 0; !eof && i < want; i++) { - ret = swap_read_page(handle, page[ring], &bio); + ret = swap_read_page(handle, page[ring], &hb); if (ret) { /* * On real read error, finish. On end of data, @@ -1226,7 +1307,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!asked) break; - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1281,7 +1362,7 @@ static int load_image_lzo(struct swap_map_handle *handle, * Wait for more data while we are decompressing. */ if (have < LZO_CMP_PAGES && asked) { - ret = hib_wait_on_bio_chain(&bio); + ret = hib_wait_io(&hb); if (ret) goto out_finish; have += asked; @@ -1430,7 +1511,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_bio_read_page(swsusp_resume_block, + error = hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -1438,7 +1519,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -1482,10 +1563,10 @@ int swsusp_unmark(void) { int error; - hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_bio_write_page(swsusp_resume_block, + error = hib_submit_io(WRITE_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); diff --git a/mm/page_io.c b/mm/page_io.c index 6424869e275e..520baa4b04d7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -69,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -void end_swap_bio_read(struct bio *bio, int err) +static void end_swap_bio_read(struct bio *bio, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; From be32417796c2b8a83fe4cbece83bea96ab9e378f Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Wed, 6 May 2015 12:26:22 +0800 Subject: [PATCH 20/24] block: export blkdev_reread_part() and __blkdev_reread_part() This patch exports blkdev_reread_part() for block drivers, also introduce __blkdev_reread_part(). For some drivers, such as loop, reread of partitions can be run from the release path, and bd_mutex may already be held prior to calling ioctl_by_bdev(bdev, BLKRRPART, 0), so introduce __blkdev_reread_part for use in such cases. CC: Christoph Hellwig CC: Jens Axboe CC: Tejun Heo CC: Alexander Viro CC: Markus Pargmann CC: Stefan Weinhuber CC: Stefan Haberland CC: Sebastian Ott CC: Fabian Frederick CC: Ming Lei CC: David Herrmann CC: Andrew Morton CC: Peter Zijlstra CC: nbd-general@lists.sourceforge.net CC: linux-s390@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jarod Wilson Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/ioctl.c | 28 +++++++++++++++++++++++++--- include/linux/fs.h | 3 +++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 7d8befde2aca..203cb4aeea8b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -150,21 +150,43 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -static int blkdev_reread_part(struct block_device *bdev) +/* + * This is an exported API for the block driver, and will not + * acquire bd_mutex. This API should be used in case that + * caller has held bd_mutex already. + */ +int __blkdev_reread_part(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - int res; if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + lockdep_assert_held(&bdev->bd_mutex); + + return rescan_partitions(disk, bdev); +} +EXPORT_SYMBOL(__blkdev_reread_part); + +/* + * This is an exported API for the block driver, and will + * try to acquire bd_mutex. If bd_mutex has been held already + * in current context, please call __blkdev_reread_part(). + */ +int blkdev_reread_part(struct block_device *bdev) +{ + int res; + if (!mutex_trylock(&bdev->bd_mutex)) return -EBUSY; - res = rescan_partitions(disk, bdev); + res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); + return res; } +EXPORT_SYMBOL(blkdev_reread_part); static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len, int secure) diff --git a/include/linux/fs.h b/include/linux/fs.h index 35ec87e490b1..1ef63900243c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2279,6 +2279,9 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); extern void blkdev_put(struct block_device *bdev, fmode_t mode); +extern int __blkdev_reread_part(struct block_device *bdev); +extern int blkdev_reread_part(struct block_device *bdev); + #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); extern void bd_unlink_disk_holder(struct block_device *bdev, From b04a5636a665f5529fdf69ee7e5512156196f31c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 6 May 2015 12:26:27 +0800 Subject: [PATCH 21/24] block: replace trylock with mutex_lock in blkdev_reread_part() The only possible problem of using mutex_lock() instead of trylock is about deadlock. If there aren't any locks held before calling blkdev_reread_part(), deadlock can't be caused by this conversion. If there are locks held before calling blkdev_reread_part(), and if these locks arn't required in open, close handler and I/O path, deadlock shouldn't be caused too. Both user space's ioctl(BLKRRPART) and md_setup_drive() from init/do_mounts_md.c belongs to the 1st case, so the conversion is safe for the two cases. For loop, the previous patches in this pathset has fixed the ABBA lock dependency, so the conversion is OK. For nbd, tx_lock is held when calling the function: - both open and release won't hold the lock - when blkdev_reread_part() is run, I/O thread has been stopped already, so tx_lock won't be acquired in I/O path at that time. - so the conversion won't cause deadlock for nbd For dasd, both dasd_open(), dasd_release() and request function don't acquire any mutex/semphone, so the conversion should be safe. Reviewed-by: Christoph Hellwig Tested-by: Jarod Wilson Acked-by: Jarod Wilson Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/ioctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 203cb4aeea8b..8061eba42887 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -174,13 +174,18 @@ EXPORT_SYMBOL(__blkdev_reread_part); * This is an exported API for the block driver, and will * try to acquire bd_mutex. If bd_mutex has been held already * in current context, please call __blkdev_reread_part(). + * + * Make sure the held locks in current context aren't required + * in open()/close() handler and I/O path for avoiding ABBA deadlock: + * - bd_mutex is held before calling block driver's open/close + * handler + * - reading partition table may submit I/O to the block device */ int blkdev_reread_part(struct block_device *bdev) { int res; - if (!mutex_trylock(&bdev->bd_mutex)) - return -EBUSY; + mutex_lock(&bdev->bd_mutex); res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 22 May 2015 09:14:03 -0400 Subject: [PATCH 22/24] block: remove management of bi_remaining when restoring original bi_end_io Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") regressed all existing callers that followed this pattern: 1) saving a bio's original bi_end_io 2) wiring up an intermediate bi_end_io 3) restoring the original bi_end_io from intermediate bi_end_io 4) calling bio_endio() to execute the restored original bi_end_io The regression was due to BIO_CHAIN only ever getting set if bio_inc_remaining() is called. For the above pattern it isn't set until step 3 above (step 2 would've needed to establish BIO_CHAIN). As such the first bio_endio(), in step 2 above, never decremented __bi_remaining before calling the intermediate bi_end_io -- leaving __bi_remaining with the value 1 instead of 0. When bio_inc_remaining() occurred during step 3 it brought it to a value of 2. When the second bio_endio() was called, in step 4 above, it should've called the original bi_end_io but it didn't because there was an extra reference that wasn't dropped (due to atomic operations being optimized away since BIO_CHAIN wasn't set upfront). Fix this issue by removing the __bi_remaining management complexity for all callers that use the above pattern -- bio_chain() is the only interface that _needs_ to be concerned with __bi_remaining. For the above pattern callers just expect the bi_end_io they set to get called! Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls that aren't associated with the bio_chain() interface. Also, the bio_inc_remaining() interface has been moved local to bio.c. Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains") Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- block/bio.c | 35 ++++++++++++++--------------------- drivers/md/bcache/io.c | 2 +- drivers/md/dm-cache-target.c | 6 ------ drivers/md/dm-raid1.c | 2 -- drivers/md/dm-snap.c | 1 - drivers/md/dm-thin.c | 9 +++------ drivers/md/dm-verity.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/volumes.c | 16 +++++----------- fs/btrfs/volumes.h | 2 -- include/linux/bio.h | 12 ------------ 12 files changed, 27 insertions(+), 66 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5cbd5d9ea61d..0436c21db7f2 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); } /** @@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error) */ if (error) { bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio, error); return; } diff --git a/block/bio.c b/block/bio.c index c2ff8a88aef1..259197d97de1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error) bio_put(bio); } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /** * bio_chain - chain bio completions * @bio: the target bio @@ -1756,8 +1767,10 @@ static inline bool bio_remaining_done(struct bio *bio) BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); - if (atomic_dec_and_test(&bio->__bi_remaining)) + if (atomic_dec_and_test(&bio->__bi_remaining)) { + clear_bit(BIO_CHAIN, &bio->bi_flags); return true; + } return false; } @@ -1808,26 +1821,6 @@ void bio_endio(struct bio *bio, int error) } EXPORT_SYMBOL(bio_endio); -/** - * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining - * @bio: bio - * @error: error, if any - * - * For code that has saved and restored bi_end_io; thing hard before using this - * function, probably you should've cloned the entire bio. - **/ -void bio_endio_nodec(struct bio *bio, int error) -{ - /* - * If it's not flagged as a chain, we are not going to dec the count - */ - if (bio_flagged(bio, BIO_CHAIN)) - bio_inc_remaining(bio); - - bio_endio(bio, error); -} -EXPORT_SYMBOL(bio_endio_nodec); - /** * bio_split - split a bio * @bio: bio to split diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fa028fa82df4..cb64e64a4789 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -55,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl) s->bio->bi_end_io = s->bi_end_io; s->bio->bi_private = s->bi_private; - bio_endio_nodec(s->bio, 0); + bio_endio(s->bio, 0); closure_debug_destroy(&s->cl); mempool_free(s, s->p->bio_split_hook); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 705eb7b99d69..41b2594a80c6 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -86,12 +86,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) { bio->bi_end_io = h->bi_end_io; bio->bi_private = h->bi_private; - - /* - * Must bump bi_remaining to allow bio to complete with - * restored bi_end_io. - */ - bio_inc_remaining(bio); } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d6a1c096b777..743fa9bbae9e 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1254,8 +1254,6 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) dm_bio_restore(bd, bio); bio_record->details.bi_bdev = NULL; - bio_inc_remaining(bio); - queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 8bfeae218531..7c82d3ccce87 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1478,7 +1478,6 @@ out: if (full_bio) { full_bio->bi_end_io = pe->full_bio_end_io; full_bio->bi_private = pe->full_bio_private; - bio_inc_remaining(full_bio); } increment_pending_exceptions_done_count(); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 342dbdad6131..e852602c0091 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -793,10 +793,9 @@ static void inc_remap_and_issue_cell(struct thin_c *tc, static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { - if (m->bio) { + if (m->bio) m->bio->bi_end_io = m->saved_bi_end_io; - bio_inc_remaining(m->bio); - } + cell_error(m->tc->pool, m->cell); list_del(&m->list); mempool_free(m, m->tc->pool->mapping_pool); @@ -810,10 +809,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) int r; bio = m->bio; - if (bio) { + if (bio) bio->bi_end_io = m->saved_bi_end_io; - bio_inc_remaining(bio); - } if (m->err) { cell_error(pool, m->cell); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 66616db33e6f..bb9c6a00e4b0 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -459,7 +459,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio->bi_end_io = io->orig_bi_end_io; bio->bi_private = io->orig_bi_private; - bio_endio_nodec(bio, error); + bio_endio(bio, error); } static void verity_work(struct work_struct *w) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e08a926fe12c..0bccf18dc1dc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1745,7 +1745,7 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio_nodec(bio, error); + bio_endio(bio, error); } static int cleaner_kthread(void *arg) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8e8d1d1e28a5..dac77d42a9ab 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5585,10 +5585,10 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) { - if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) - bio_endio_nodec(bio, err); - else - bio_endio(bio, err); + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio_endio(bio, err); + btrfs_put_bbio(bbio); } @@ -5632,8 +5632,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio @@ -5815,8 +5813,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) /* Shoud be the original bio. */ WARN_ON(bio != bbio->orig_bio); - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; @@ -5897,10 +5893,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ - } else { + } else bio = first_bio; - bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; - } submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ebc31331a837..cedae0356558 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -292,8 +292,6 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) - struct btrfs_bio { atomic_t refs; atomic_t stripes_pending; diff --git a/include/linux/bio.h b/include/linux/bio.h index 7486ea103f6e..f0291cf64cc5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -427,7 +427,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } extern void bio_endio(struct bio *, int); -extern void bio_endio_nodec(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -658,17 +657,6 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } -/* - * Increment chain count for the bio. Make sure the CHAIN flag update - * is visible before the raised count. - */ -static inline void bio_inc_remaining(struct bio *bio) -{ - bio->bi_flags |= (1 << BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 May 2015 09:14:04 -0400 Subject: [PATCH 23/24] block, dm: don't copy bios for request clones Currently dm-multipath has to clone the bios for every request sent to the lower devices, which wastes cpu cycles and ties down memory. This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio to not complete bios attached to a request, which we set on clone requests similar to bios in a flush sequence. With this change I/O errors on a path failure only get propagated to dm-multipath, which can then either resubmit the I/O or complete the bios on the original request. I've done some basic testing of this on a Linux target with ALUA support, and it survives path failures during I/O nicely. Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 94 +++------------------ drivers/md/dm-table.c | 25 ++++-- drivers/md/dm.c | 171 +++++++++----------------------------- drivers/md/dm.h | 5 +- include/linux/blk_types.h | 2 + include/linux/blkdev.h | 6 +- 6 files changed, 73 insertions(+), 230 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index de474b5dee2b..aa819a58ea24 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { - if (error) + if (error && !(rq->cmd_flags & REQ_CLONE)) clear_bit(BIO_UPTODATE, &bio->bi_flags); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; @@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && + !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE))) bio_endio(bio, error); } @@ -2909,95 +2910,22 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); -/** - * blk_rq_unprep_clone - Helper function to free all bios in a cloned request - * @rq: the clone request to be cleaned up - * - * Description: - * Free all bios in @rq for a cloned request. - */ -void blk_rq_unprep_clone(struct request *rq) -{ - struct bio *bio; - - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - - bio_put(bio); - } -} -EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); - -/* - * Copy attributes of the original request to the clone request. - * The actual data parts (e.g. ->cmd, ->sense) are not copied. - */ -static void __blk_rq_prep_clone(struct request *dst, struct request *src) +void blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; + dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK); + dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); dst->nr_phys_segments = src->nr_phys_segments; dst->ioprio = src->ioprio; dst->extra_len = src->extra_len; -} - -/** - * blk_rq_prep_clone - Helper function to setup clone request - * @rq: the request to be setup - * @rq_src: original request to be cloned - * @bs: bio_set that bios for clone are allocated from - * @gfp_mask: memory allocation mask for bio - * @bio_ctr: setup function to be called for each clone bio. - * Returns %0 for success, non %0 for failure. - * @data: private data to be passed to @bio_ctr - * - * Description: - * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * The actual data parts of @rq_src (e.g. ->cmd, ->sense) - * are not copied, and copying such parts is the caller's responsibility. - * Also, pages which the original bios are pointing to are not copied - * and the cloned bios just point same pages. - * So cloned bios must be completed before original bios, which means - * the caller must complete @rq before @rq_src. - */ -int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data) -{ - struct bio *bio, *bio_src; - - if (!bs) - bs = fs_bio_set; - - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); - if (!bio) - goto free_and_out; - - if (bio_ctr && bio_ctr(bio, bio_src, data)) - goto free_and_out; - - if (rq->bio) { - rq->biotail->bi_next = bio; - rq->biotail = bio; - } else - rq->bio = rq->biotail = bio; - } - - __blk_rq_prep_clone(rq, rq_src); - - return 0; - -free_and_out: - if (bio) - bio_put(bio); - blk_rq_unprep_clone(rq); - - return -ENOMEM; + dst->bio = src->bio; + dst->biotail = src->biotail; + dst->cmd = src->cmd; + dst->cmd_len = src->cmd_len; + dst->sense = src->sense; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index d9b00b8565c6..3662b2e49b8d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -940,21 +940,28 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * { unsigned type = dm_table_get_type(t); unsigned per_bio_data_size = 0; - struct dm_target *tgt; unsigned i; - if (unlikely(type == DM_TYPE_NONE)) { + switch (type) { + case DM_TYPE_BIO_BASED: + for (i = 0; i < t->num_targets; i++) { + struct dm_target *tgt = t->targets + i; + + per_bio_data_size = max(per_bio_data_size, + tgt->per_bio_data_size); + } + t->mempools = dm_alloc_bio_mempools(t->integrity_supported, + per_bio_data_size); + break; + case DM_TYPE_REQUEST_BASED: + case DM_TYPE_MQ_REQUEST_BASED: + t->mempools = dm_alloc_rq_mempools(md, type); + break; + default: DMWARN("no table type is set, can't allocate mempools"); return -EINVAL; } - if (type == DM_TYPE_BIO_BASED) - for (i = 0; i < t->num_targets; i++) { - tgt = t->targets + i; - per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); - } - - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size); if (!t->mempools) return -ENOMEM; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a930b72314ac..38837f8ea327 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -990,57 +990,6 @@ static void clone_endio(struct bio *bio, int error) dec_pending(io, error); } -/* - * Partial completion handling for request-based dm - */ -static void end_clone_bio(struct bio *clone, int error) -{ - struct dm_rq_clone_bio_info *info = - container_of(clone, struct dm_rq_clone_bio_info, clone); - struct dm_rq_target_io *tio = info->tio; - struct bio *bio = info->orig; - unsigned int nr_bytes = info->orig->bi_iter.bi_size; - - bio_put(clone); - - if (tio->error) - /* - * An error has already been detected on the request. - * Once error occurred, just let clone->end_io() handle - * the remainder. - */ - return; - else if (error) { - /* - * Don't notice the error to the upper layer yet. - * The error handling decision is made by the target driver, - * when the request is completed. - */ - tio->error = error; - return; - } - - /* - * I/O for the bio successfully completed. - * Notice the data completion to the upper layer. - */ - - /* - * bios are processed from the head of the list. - * So the completing bio should always be rq->bio. - * If it's not, something wrong is happening. - */ - if (tio->orig->bio != bio) - DMERR("bio completion is going in the middle of the request"); - - /* - * Update the original request. - * Do not use blk_end_request() here, because it may complete - * the original request before the clone, and break the ordering. - */ - blk_update_request(tio->orig, 0, nr_bytes); -} - static struct dm_rq_target_io *tio_from_request(struct request *rq) { return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); @@ -1089,8 +1038,6 @@ static void free_rq_clone(struct request *clone, bool must_be_mapped) WARN_ON_ONCE(must_be_mapped && !clone->q); - blk_rq_unprep_clone(clone); - if (md->type == DM_TYPE_MQ_REQUEST_BASED) /* stacked on blk-mq queue(s) */ tio->ti->type->release_clone_rq(clone); @@ -1821,39 +1768,13 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) dm_complete_request(rq, r); } -static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, - void *data) +static void setup_clone(struct request *clone, struct request *rq, + struct dm_rq_target_io *tio) { - struct dm_rq_target_io *tio = data; - struct dm_rq_clone_bio_info *info = - container_of(bio, struct dm_rq_clone_bio_info, clone); - - info->orig = bio_orig; - info->tio = tio; - bio->bi_end_io = end_clone_bio; - - return 0; -} - -static int setup_clone(struct request *clone, struct request *rq, - struct dm_rq_target_io *tio, gfp_t gfp_mask) -{ - int r; - - r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, - dm_rq_bio_constructor, tio); - if (r) - return r; - - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; + blk_rq_prep_clone(clone, rq); clone->end_io = end_clone_request; clone->end_io_data = tio; - tio->clone = clone; - - return 0; } static struct request *clone_rq(struct request *rq, struct mapped_device *md, @@ -1874,12 +1795,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, clone = tio->clone; blk_rq_init(NULL, clone); - if (setup_clone(clone, rq, tio, gfp_mask)) { - /* -ENOMEM */ - if (alloc_clone) - free_clone_request(md, clone); - return NULL; - } + setup_clone(clone, rq, tio); return clone; } @@ -1973,11 +1889,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq, } if (IS_ERR(clone)) return DM_MAPIO_REQUEUE; - if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { - /* -ENOMEM */ - ti->type->release_clone_rq(clone); - return DM_MAPIO_REQUEUE; - } + setup_clone(clone, rq, tio); } switch (r) { @@ -2431,8 +2343,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } - BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); - md->io_pool = p->io_pool; p->io_pool = NULL; md->rq_pool = p->rq_pool; @@ -3536,48 +3446,23 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, - unsigned integrity, unsigned per_bio_data_size) +struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity, + unsigned per_bio_data_size) { - struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); - struct kmem_cache *cachep = NULL; - unsigned int pool_size = 0; + struct dm_md_mempools *pools; + unsigned int pool_size = dm_get_reserved_bio_based_ios(); unsigned int front_pad; + pools = kzalloc(sizeof(*pools), GFP_KERNEL); if (!pools) return NULL; - type = filter_md_type(type, md); + front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + + offsetof(struct dm_target_io, clone); - switch (type) { - case DM_TYPE_BIO_BASED: - cachep = _io_cache; - pool_size = dm_get_reserved_bio_based_ios(); - front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - break; - case DM_TYPE_REQUEST_BASED: - cachep = _rq_tio_cache; - pool_size = dm_get_reserved_rq_based_ios(); - pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); - if (!pools->rq_pool) - goto out; - /* fall through to setup remaining rq-based pools */ - case DM_TYPE_MQ_REQUEST_BASED: - if (!pool_size) - pool_size = dm_get_reserved_rq_based_ios(); - front_pad = offsetof(struct dm_rq_clone_bio_info, clone); - /* per_bio_data_size is not used. See __bind_mempools(). */ - WARN_ON(per_bio_data_size != 0); - break; - default: - BUG(); - } - - if (cachep) { - pools->io_pool = mempool_create_slab_pool(pool_size, cachep); - if (!pools->io_pool) - goto out; - } + pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); + if (!pools->io_pool) + goto out; pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) @@ -3587,10 +3472,34 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t goto out; return pools; - out: dm_free_md_mempools(pools); + return NULL; +} +struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, + unsigned type) +{ + unsigned int pool_size = dm_get_reserved_rq_based_ios(); + struct dm_md_mempools *pools; + + pools = kzalloc(sizeof(*pools), GFP_KERNEL); + if (!pools) + return NULL; + + if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) { + pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); + if (!pools->rq_pool) + goto out; + } + + pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache); + if (!pools->io_pool) + goto out; + + return pools; +out: + dm_free_md_mempools(pools); return NULL; } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 6123c2bf9150..e6e66d087b26 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -222,8 +222,9 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, - unsigned integrity, unsigned per_bio_data_size); +struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity, + unsigned per_bio_data_size); +struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type); void dm_free_md_mempools(struct dm_md_mempools *pools); /* diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3f4ded0b1a34..45a6be89957c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -192,6 +192,7 @@ enum rq_flag_bits { __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NO_TIMEOUT, /* requests may never expire */ + __REQ_CLONE, /* cloned bios */ __REQ_NR_BITS, /* stops here */ }; @@ -246,5 +247,6 @@ enum rq_flag_bits { #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) #define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) +#define REQ_CLONE (1ULL << __REQ_CLONE) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc917956a6d0..9ded80da2c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -775,11 +775,7 @@ extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); -extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data); -extern void blk_rq_unprep_clone(struct request *rq); +extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_delay_queue(struct request_queue *, unsigned long); From f6454b049d81bb3d732b6a8afde08420589c6af9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 26 May 2015 21:59:53 +0200 Subject: [PATCH 24/24] block: fix returnvar.cocci warnings Remove unneeded variable used to store return value. Generated by: scripts/coccinelle/misc/returnvar.cocci Signed-off-by: Fengguang Wu Signed-off-by: Julia Lawall Signed-off-by: Jens Axboe --- fs/buffer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index efd85e0e8660..f96173ad62d9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2996,7 +2996,6 @@ void guard_bio_eod(int rw, struct bio *bio) int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) { struct bio *bio; - int ret = 0; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); @@ -3038,7 +3037,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) rw |= REQ_PRIO; submit_bio(rw, bio); - return ret; + return 0; } EXPORT_SYMBOL_GPL(_submit_bh);