From eb5df87fab0ae7114b83dc7f338b27d039374767 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:10 +0800 Subject: [PATCH 01/21] xen/blkif: document blkif multi-queue/ring extension Document the multi-queue/ring feature in terms of XenStore keys to be written by the backend and by the frontend. Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- include/xen/interface/io/blkif.h | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c33e1c489eb2..8b8cfadf7833 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -27,6 +27,54 @@ typedef uint16_t blkif_vdev_t; typedef uint64_t blkif_sector_t; +/* + * Multiple hardware queues/rings: + * If supported, the backend will write the key "multi-queue-max-queues" to + * the directory for that vbd, and set its value to the maximum supported + * number of queues. + * Frontends that are aware of this feature and wish to use it can write the + * key "multi-queue-num-queues" with the number they wish to use, which must be + * greater than zero, and no more than the value reported by the backend in + * "multi-queue-max-queues". + * + * For frontends requesting just one queue, the usual event-channel and + * ring-ref keys are written as before, simplifying the backend processing + * to avoid distinguishing between a frontend that doesn't understand the + * multi-queue feature, and one that does, but requested only one queue. + * + * Frontends requesting two or more queues must not write the toplevel + * event-channel and ring-ref keys, instead writing those keys under sub-keys + * having the name "queue-N" where N is the integer ID of the queue/ring to + * which those keys belong. Queues are indexed from zero. + * For example, a frontend with two queues must write the following set of + * queue-related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref = "" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref = "" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "" + * + * It is also possible to use multiple queues/rings together with the + * multi-page ring buffer feature. + * For example, a frontend requesting two queues/rings, where each ring + * buffer is two pages, must write the following set of related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/ring-page-order = "1" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "" + * + */ /* * REQUEST CODES. */ From 81f351615772365d46ceeac3e50c9dd4e8f9dc89 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:11 +0800 Subject: [PATCH 02/21] xen/blkfront: separate per ring information out of device info Split per-ring information out into a new structure, "blkfront_ring_info" (sketched just below).
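For orientation, here is a condensed sketch of the resulting split. It abridges both structures to the fields this patch actually moves; see the full diff below for the authoritative definitions.

/*
 * Condensed sketch of the split introduced by this patch (abridged).
 * All per-ring state moves out of blkfront_info into blkfront_ring_info.
 */
struct blkfront_ring_info {
	struct blkif_front_ring ring;                  /* the shared ring itself */
	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; /* grant refs of the ring pages */
	unsigned int evtchn, irq;                      /* per-ring notification channel */
	struct work_struct work;
	struct gnttab_free_callback callback;
	struct blk_shadow shadow[BLK_MAX_RING_SIZE];   /* in-flight request state */
	struct list_head indirect_pages;
	unsigned long shadow_free;
	struct blkfront_info *dev_info;                /* backpointer to the device */
};

struct blkfront_info {
	/* ... device-wide state stays here: xbdev, gendisk, grants list, features ... */
	struct blkfront_ring_info rinfo;               /* a single ring, for now */
};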
A ring is the representation of a hardware queue; every vbd device can be associated with one or more rings, depending on how many hardware queues/rings are to be used. This patch is a preparation for supporting real multiple hardware queues/rings. We also add a backpointer to 'struct blkfront_info' (dev_info), which is not strictly needed (we could use container_of), but a later patch ("xen/blkfront: pseudo support for multi hardware queues/rings") will make the allocation of 'blkfront_ring_info' dynamic. Signed-off-by: Arianna Avanzini Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 357 +++++++++++++++++++---------------- 1 file changed, 196 insertions(+), 161 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2fee2eef988d..0c3ad214a792 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -119,6 +119,23 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the */ #define RINGREF_NAME_LEN (20) +/* + * Per-ring info. + * Every blkfront device can be associated with one or more blkfront_ring_info, + * depending on how many hardware queues/rings are to be used. + */ +struct blkfront_ring_info { + struct blkif_front_ring ring; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int evtchn, irq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; + struct list_head indirect_pages; + unsigned long shadow_free; + struct blkfront_info *dev_info; +}; + /* * We have one of these per vbd, whether ide, scsi or 'other'. They * hang in private_data off the gendisk structure. We may end up @@ -133,18 +150,10 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int nr_ring_pages; - struct blkif_front_ring ring; - unsigned int evtchn, irq; struct request_queue *rq; - struct work_struct work; - struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; struct list_head grants; - struct list_head indirect_pages; unsigned int persistent_gnts_c; - unsigned long shadow_free; unsigned int feature_flush; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; @@ -155,6 +164,7 @@ struct blkfront_info unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; + struct blkfront_ring_info rinfo; }; static unsigned int nr_minors; @@ -198,33 +208,35 @@ static DEFINE_SPINLOCK(minor_lock); #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) -static int blkfront_setup_indirect(struct blkfront_info *info); +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); static int blkfront_gather_backend_features(struct blkfront_info *info); -static int get_id_from_freelist(struct blkfront_info *info) +static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { - unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE(info)); - info->shadow_free = info->shadow[free].req.u.rw.id; - info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ + unsigned long free = rinfo->shadow_free; + + BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info)); + rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id; + rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } -static int add_id_to_freelist(struct blkfront_info *info, +static int add_id_to_freelist(struct blkfront_ring_info *rinfo, unsigned long id) { - if (info->shadow[id].req.u.rw.id != id) + if (rinfo->shadow[id].req.u.rw.id != id)
return -EINVAL; - if (info->shadow[id].request == NULL) + if (rinfo->shadow[id].request == NULL) return -EINVAL; - info->shadow[id].req.u.rw.id = info->shadow_free; - info->shadow[id].request = NULL; - info->shadow_free = id; + rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free; + rinfo->shadow[id].request = NULL; + rinfo->shadow_free = id; return 0; } -static int fill_grant_buffer(struct blkfront_info *info, int num) +static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) { + struct blkfront_info *info = rinfo->dev_info; struct page *granted_page; struct grant *gnt_list_entry, *n; int i = 0; @@ -326,8 +338,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, + BUG_ON(list_empty(&info->rinfo.indirect_pages)); + indirect_page = list_first_entry(&info->rinfo.indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -403,8 +415,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) static void blkif_restart_queue_callback(void *arg) { - struct blkfront_info *info = (struct blkfront_info *)arg; - schedule_work(&info->work); + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg; + schedule_work(&rinfo->work); } static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) @@ -456,16 +468,16 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int blkif_queue_discard_req(struct request *req) +static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; + struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req; unsigned long id; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; + ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -476,10 +488,10 @@ static int blkif_queue_discard_req(struct request *req) else ring_req->u.discard.flag = 0; - info->ring.req_prod_pvt++; + rinfo->ring.req_prod_pvt++; /* Keep a private copy so we can reissue requests when recovering. 
*/ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; return 0; } @@ -487,7 +499,7 @@ static int blkif_queue_discard_req(struct request *req) struct setup_rw_req { unsigned int grant_idx; struct blkif_request_segment *segments; - struct blkfront_info *info; + struct blkfront_ring_info *rinfo; struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; @@ -507,8 +519,9 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, /* Convenient aliases */ unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; - struct blkfront_info *info = setup->info; - struct blk_shadow *shadow = &info->shadow[setup->id]; + struct blkfront_ring_info *rinfo = setup->rinfo; + struct blkfront_info *info = rinfo->dev_info; + struct blk_shadow *shadow = &rinfo->shadow[setup->id]; if ((ring_req->operation == BLKIF_OP_INDIRECT) && (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { @@ -566,16 +579,16 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, (setup->grant_idx)++; } -static int blkif_queue_rw_req(struct request *req) +static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; + struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req; unsigned long id; int i; struct setup_rw_req setup = { .grant_idx = 0, .segments = NULL, - .info = info, + .rinfo = rinfo, .need_copy = rq_data_dir(req) && info->feature_persistent, }; @@ -603,9 +616,9 @@ static int blkif_queue_rw_req(struct request *req) max_grefs - info->persistent_gnts_c, &setup.gref_head) < 0) { gnttab_request_free_callback( - &info->callback, + &rinfo->callback, blkif_restart_queue_callback, - info, + rinfo, max_grefs); return 1; } @@ -613,23 +626,23 @@ static int blkif_queue_rw_req(struct request *req) new_persistent_gnts = 0; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; + ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; BUG_ON(info->max_indirect_segments == 0 && GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); BUG_ON(info->max_indirect_segments && GREFS(req->nr_phys_segments) > info->max_indirect_segments); - num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ - for_each_sg(info->shadow[id].sg, sg, num_sg, i) + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) num_grant += gnttab_count_grant(sg->offset, sg->length); ring_req->u.rw.id = id; - info->shadow[id].num_sg = num_sg; + rinfo->shadow[id].num_sg = num_sg; if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { /* * The indirect operation can only be a BLKIF_OP_READ or @@ -674,7 +687,7 @@ static int blkif_queue_rw_req(struct request *req) setup.ring_req = ring_req; setup.id = id; - for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); if (setup.need_copy) { @@ -694,10 +707,10 @@ static int blkif_queue_rw_req(struct request *req) if (setup.segments) kunmap_atomic(setup.segments); - info->ring.req_prod_pvt++; + rinfo->ring.req_prod_pvt++; /* Keep a private copy so we can reissue requests when recovering. 
*/ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; if (new_persistent_gnts) gnttab_free_grant_references(setup.gref_head); @@ -711,27 +724,25 @@ static int blkif_queue_rw_req(struct request *req) * * @req: a request struct */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) return 1; if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) - return blkif_queue_discard_req(req); + return blkif_queue_discard_req(req, rinfo); else - return blkif_queue_rw_req(req); + return blkif_queue_rw_req(req, rinfo); } -static inline void flush_requests(struct blkfront_info *info) +static inline void flush_requests(struct blkfront_ring_info *rinfo) { int notify; - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify); if (notify) - notify_remote_via_irq(info->irq); + notify_remote_via_irq(rinfo->irq); } static inline bool blkif_request_flush_invalid(struct request *req, @@ -747,20 +758,21 @@ static inline bool blkif_request_flush_invalid(struct request *req, static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = qd->rq->rq_disk->private_data; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; + struct blkfront_info *info = rinfo->dev_info; blk_mq_start_request(qd->rq); spin_lock_irq(&info->io_lock); - if (RING_FULL(&info->ring)) + if (RING_FULL(&rinfo->ring)) goto out_busy; - if (blkif_request_flush_invalid(qd->rq, info)) + if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) goto out_err; - if (blkif_queue_request(qd->rq)) + if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; - flush_requests(info); + flush_requests(rinfo); spin_unlock_irq(&info->io_lock); return BLK_MQ_RQ_QUEUE_OK; @@ -774,9 +786,19 @@ out_busy: return BLK_MQ_RQ_QUEUE_BUSY; } +static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct blkfront_info *info = (struct blkfront_info *)data; + + hctx->driver_data = &info->rinfo; + return 0; +} + static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, + .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -1029,6 +1051,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { unsigned int minor, nr_minors; + struct blkfront_ring_info *rinfo = &info->rinfo; if (info->rq == NULL) return; @@ -1037,10 +1060,10 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) blk_mq_stop_hw_queues(info->rq); /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); + gnttab_cancel_free_callback(&rinfo->callback); /* Flush gnttab callback work. Must be done with no locks held. 
*/ - flush_work(&info->work); + flush_work(&rinfo->work); del_gendisk(info->gd); @@ -1057,20 +1080,20 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) } /* Must be called with io_lock holded */ -static void kick_pending_request_queues(struct blkfront_info *info) +static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) { - if (!RING_FULL(&info->ring)) - blk_mq_start_stopped_hw_queues(info->rq, true); + if (!RING_FULL(&rinfo->ring)) + blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } static void blkif_restart_queue(struct work_struct *work) { - struct blkfront_info *info = container_of(work, struct blkfront_info, work); + struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); - spin_lock_irq(&info->io_lock); - if (info->connected == BLKIF_STATE_CONNECTED) - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + spin_lock_irq(&rinfo->dev_info->io_lock); + if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(rinfo); + spin_unlock_irq(&rinfo->dev_info->io_lock); } static void blkif_free(struct blkfront_info *info, int suspend) @@ -1078,6 +1101,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) struct grant *persistent_gnt; struct grant *n; int i, j, segs; + struct blkfront_ring_info *rinfo = &info->rinfo; /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); @@ -1090,7 +1114,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) /* Remove all persistent grants */ if (!list_empty(&info->grants)) { list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { + &info->grants, node) { list_del(&persistent_gnt->node); if (persistent_gnt->gref != GRANT_INVALID_REF) { gnttab_end_foreign_access(persistent_gnt->gref, @@ -1108,11 +1132,11 @@ static void blkif_free(struct blkfront_info *info, int suspend) * Remove indirect pages, this only happens when using indirect * descriptors but not persistent grants */ - if (!list_empty(&info->indirect_pages)) { + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; BUG_ON(info->feature_persistent); - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } @@ -1123,21 +1147,21 @@ static void blkif_free(struct blkfront_info *info, int suspend) * Clear persistent grants present in requests already * on the shared ring */ - if (!info->shadow[i].request) + if (!rinfo->shadow[i].request) goto free_shadow; - segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? - info->shadow[i].req.u.indirect.nr_segments : - info->shadow[i].req.u.rw.nr_segments; + segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ? 
+ rinfo->shadow[i].req.u.indirect.nr_segments : + rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { - persistent_gnt = info->shadow[i].grants_used[j]; + persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) __free_page(persistent_gnt->page); kfree(persistent_gnt); } - if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) /* * If this is not an indirect operation don't try to * free indirect segments @@ -1145,41 +1169,41 @@ static void blkif_free(struct blkfront_info *info, int suspend) goto free_shadow; for (j = 0; j < INDIRECT_GREFS(segs); j++) { - persistent_gnt = info->shadow[i].indirect_grants[j]; + persistent_gnt = rinfo->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } free_shadow: - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; } /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); + gnttab_cancel_free_callback(&rinfo->callback); spin_unlock_irq(&info->io_lock); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + flush_work(&rinfo->work); /* Free resources associated with old device channel. */ for (i = 0; i < info->nr_ring_pages; i++) { - if (info->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref[i], 0, 0); - info->ring_ref[i] = GRANT_INVALID_REF; + if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); + rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); - info->ring.sring = NULL; + free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + rinfo->ring.sring = NULL; - if (info->irq) - unbind_from_irqhandler(info->irq, info); - info->evtchn = info->irq = 0; + if (rinfo->irq) + unbind_from_irqhandler(rinfo->irq, rinfo); + rinfo->evtchn = rinfo->irq = 0; } @@ -1209,12 +1233,13 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(shared_data); } -static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, +static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *rinfo, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; int num_sg, num_grant; + struct blkfront_info *info = rinfo->dev_info; struct copy_from_grant data = { .s = s, .grant_idx = 0, @@ -1284,7 +1309,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ if (!info->feature_persistent) { indirect_page = s->indirect_grants[i]->page; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; list_add_tail(&s->indirect_grants[i]->node, &info->grants); @@ -1299,7 +1324,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkif_response *bret; RING_IDX i, rp; unsigned long flags; - 
struct blkfront_info *info = (struct blkfront_info *)dev_id; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; + struct blkfront_info *info = rinfo->dev_info; int error; spin_lock_irqsave(&info->io_lock, flags); @@ -1310,13 +1336,13 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } again: - rp = info->ring.sring->rsp_prod; + rp = rinfo->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = info->ring.rsp_cons; i != rp; i++) { + for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; - bret = RING_GET_RESPONSE(&info->ring, i); + bret = RING_GET_RESPONSE(&rinfo->ring, i); id = bret->id; /* * The backend has messed up and given us an id that we would @@ -1330,12 +1356,12 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * the id is busted. */ continue; } - req = info->shadow[id].request; + req = rinfo->shadow[id].request; if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id], info, bret); + blkif_completion(&rinfo->shadow[id], rinfo, bret); - if (add_id_to_freelist(info, id)) { + if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", info->gd->disk_name, op_name(bret->operation), id); continue; @@ -1364,7 +1390,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.u.rw.nr_segments == 0)) { + rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; @@ -1389,17 +1415,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } } - info->ring.rsp_cons = i; + rinfo->ring.rsp_cons = i; - if (i != info->ring.req_prod_pvt) { + if (i != rinfo->ring.req_prod_pvt) { int more_to_do; - RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); if (more_to_do) goto again; } else - info->ring.sring->rsp_event = i + 1; + rinfo->ring.sring->rsp_event = i + 1; - kick_pending_request_queues(info); + kick_pending_request_queues(rinfo); spin_unlock_irqrestore(&info->io_lock, flags); @@ -1408,15 +1434,16 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) static int setup_blkring(struct xenbus_device *dev, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; int err, i; + struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = GRANT_INVALID_REF; + rinfo->ring_ref[i] = GRANT_INVALID_REF; sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, get_order(ring_size)); @@ -1425,29 +1452,29 @@ static int setup_blkring(struct xenbus_device *dev, return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, ring_size); + FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); + err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { free_pages((unsigned long)sring, get_order(ring_size)); - info->ring.sring = NULL; + rinfo->ring.sring = NULL; goto fail; } for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = gref[i]; + rinfo->ring_ref[i] = gref[i]; - err = xenbus_alloc_evtchn(dev, &info->evtchn); + err = xenbus_alloc_evtchn(dev, 
&rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, - "blkif", info); + err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, + "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); goto fail; } - info->irq = err; + rinfo->irq = err; return 0; fail: @@ -1465,6 +1492,7 @@ static int talk_to_blkback(struct xenbus_device *dev, int err, i; unsigned int max_page_order = 0; unsigned int ring_page_order = 0; + struct blkfront_ring_info *rinfo = &info->rinfo; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, "max-ring-page-order", "%u", &max_page_order); @@ -1476,7 +1504,7 @@ static int talk_to_blkback(struct xenbus_device *dev, } /* Create shared ring, alloc event channel. */ - err = setup_blkring(dev, info); + err = setup_blkring(dev, rinfo); if (err) goto out; @@ -1489,7 +1517,7 @@ again: if (info->nr_ring_pages == 1) { err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref[0]); + "ring-ref", "%u", rinfo->ring_ref[0]); if (err) { message = "writing ring-ref"; goto abort_transaction; @@ -1507,7 +1535,7 @@ again: snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", info->ring_ref[i]); + "%u", rinfo->ring_ref[i]); if (err) { message = "writing ring-ref"; goto abort_transaction; @@ -1515,7 +1543,7 @@ again: } } err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", info->evtchn); + "event-channel", "%u", rinfo->evtchn); if (err) { message = "writing event-channel"; goto abort_transaction; @@ -1541,8 +1569,8 @@ again: } for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1568,6 +1596,7 @@ static int blkfront_probe(struct xenbus_device *dev, { int err, vdevice; struct blkfront_info *info; + struct blkfront_ring_info *rinfo; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1617,15 +1646,18 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } + rinfo = &info->rinfo; + INIT_LIST_HEAD(&rinfo->indirect_pages); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + mutex_init(&info->mutex); spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; INIT_LIST_HEAD(&info->grants); - INIT_LIST_HEAD(&info->indirect_pages); info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; - INIT_WORK(&info->work, blkif_restart_queue); /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1659,19 +1691,20 @@ static int blkif_recover(struct blkfront_info *info) int pending, size; struct split_bio *split_bio; struct list_head requests; + struct blkfront_ring_info *rinfo = &info->rinfo; /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(info->shadow, sizeof(info->shadow), + copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); if (!copy) return -ENOMEM; /* Stage 2: Set up free list. 
*/ - memset(&info->shadow, 0, sizeof(info->shadow)); + memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow_free = rinfo->ring.req_prod_pvt; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; rc = blkfront_gather_backend_features(info); if (rc) { @@ -1717,7 +1750,7 @@ static int blkif_recover(struct blkfront_info *info) info->connected = BLKIF_STATE_CONNECTED; /* Kick any other new requests queued since we resumed */ - kick_pending_request_queues(info); + kick_pending_request_queues(rinfo); list_for_each_entry_safe(req, n, &requests, queuelist) { /* Requeue pending requests (flush or discard) */ @@ -1851,10 +1884,11 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->feature_secdiscard = !!discard_secure; } -static int blkfront_setup_indirect(struct blkfront_info *info) +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) { unsigned int psegs, grants; int err, i; + struct blkfront_info *info = rinfo->dev_info; if (info->max_indirect_segments == 0) grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; @@ -1862,7 +1896,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) grants = info->max_indirect_segments; psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, + err = fill_grant_buffer(rinfo, (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1875,31 +1909,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info) */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); - BUG_ON(!list_empty(&info->indirect_pages)); + BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { struct page *indirect_page = alloc_page(GFP_NOIO); if (!indirect_page) goto out_of_memory; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } } for (i = 0; i < BLK_RING_SIZE(info); i++) { - info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * grants, + rinfo->shadow[i].grants_used = kzalloc( + sizeof(rinfo->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); + rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) - info->shadow[i].indirect_grants = kzalloc( - sizeof(info->shadow[i].indirect_grants[0]) * + rinfo->shadow[i].indirect_grants = kzalloc( + sizeof(rinfo->shadow[i].indirect_grants[0]) * INDIRECT_GREFS(grants), GFP_NOIO); - if ((info->shadow[i].grants_used == NULL) || - (info->shadow[i].sg == NULL) || + if ((rinfo->shadow[i].grants_used == NULL) || + (rinfo->shadow[i].sg == NULL) || (info->max_indirect_segments && - (info->shadow[i].indirect_grants == NULL))) + (rinfo->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, psegs); + sg_init_table(rinfo->shadow[i].sg, psegs); } @@ -1907,16 +1941,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info) out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) { - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; + kfree(rinfo->shadow[i].grants_used); + 
rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; } - if (!list_empty(&info->indirect_pages)) { + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } @@ -1983,7 +2017,7 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) info->max_indirect_segments = min(indirect_segments, xen_blkif_max_segments); - return blkfront_setup_indirect(info); + return blkfront_setup_indirect(&info->rinfo); } /* @@ -1997,6 +2031,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned int physical_sector_size; unsigned int binfo; int err; + struct blkfront_ring_info *rinfo = &info->rinfo; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2073,7 +2108,7 @@ static void blkfront_connect(struct blkfront_info *info) /* Kick pending requests. */ spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; - kick_pending_request_queues(info); + kick_pending_request_queues(rinfo); spin_unlock_irq(&info->io_lock); add_disk(info->gd); From 3df0e5059908b8fdba351c4b5dd77caadd95a949 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:12 +0800 Subject: [PATCH 03/21] xen/blkfront: pseudo support for multi hardware queues/rings Preparatory patch for multiple hardware queues (rings). The number of rings is unconditionally set to 1; a larger number will be enabled in the patch "xen/blkfront: negotiate number of queues/rings to be used with backend", so as to make review easier. Note that blkfront_gather_backend_features no longer calls blkfront_setup_indirect (as that needs to be done per ring). That means that in blkif_recover/blkif_connect we have to do it in a loop (bounded by nr_rings). Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 347 ++++++++++++++++++++--------------- 1 file changed, 200 insertions(+), 147 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 0c3ad214a792..0638b1722a40 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -150,6 +150,7 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; + /* Number of pages per ring buffer.
*/ unsigned int nr_ring_pages; struct request_queue *rq; struct list_head grants; @@ -164,7 +165,8 @@ struct blkfront_info unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; - struct blkfront_ring_info rinfo; + struct blkfront_ring_info *rinfo; + unsigned int nr_rings; }; static unsigned int nr_minors; @@ -209,7 +211,7 @@ static DEFINE_SPINLOCK(minor_lock); #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); -static int blkfront_gather_backend_features(struct blkfront_info *info); +static void blkfront_gather_backend_features(struct blkfront_info *info); static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { @@ -338,8 +340,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->rinfo.indirect_pages)); - indirect_page = list_first_entry(&info->rinfo.indirect_pages, + BUG_ON(list_empty(&info->rinfo->indirect_pages)); + indirect_page = list_first_entry(&info->rinfo->indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -597,7 +599,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free. */ - bool new_persistent_gnts; struct scatterlist *sg; int num_sg, max_grefs, num_grant; @@ -609,12 +610,12 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri */ max_grefs += INDIRECT_GREFS(max_grefs); - /* Check if we have enough grants to allocate a requests */ - if (info->persistent_gnts_c < max_grefs) { - new_persistent_gnts = 1; - if (gnttab_alloc_grant_references( - max_grefs - info->persistent_gnts_c, - &setup.gref_head) < 0) { + /* + * We have to reserve 'max_grefs' grants because persistent + * grants are shared by all rings. + */ + if (max_grefs > 0) + if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) { gnttab_request_free_callback( &rinfo->callback, blkif_restart_queue_callback, @@ -622,8 +623,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri max_grefs); return 1; } - } else - new_persistent_gnts = 0; /* Fill out a communications ring structure. */ ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); @@ -712,7 +711,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri /* Keep a private copy so we can reissue requests when recovering. */ rinfo->shadow[id].req = *ring_req; - if (new_persistent_gnts) + if (max_grefs > 0) gnttab_free_grant_references(setup.gref_head); return 0; @@ -791,7 +790,8 @@ static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, { struct blkfront_info *info = (struct blkfront_info *)data; - hctx->driver_data = &info->rinfo; + BUG_ON(info->nr_rings <= index); + hctx->driver_data = &info->rinfo[index]; return 0; } @@ -1050,8 +1050,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { - unsigned int minor, nr_minors; - struct blkfront_ring_info *rinfo = &info->rinfo; + unsigned int minor, nr_minors, i; if (info->rq == NULL) return; @@ -1059,11 +1058,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - /* No more gnttab callback work. 
*/ - gnttab_cancel_free_callback(&rinfo->callback); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; - /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&rinfo->work); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&rinfo->callback); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&rinfo->work); + } del_gendisk(info->gd); @@ -1096,37 +1099,11 @@ static void blkif_restart_queue(struct work_struct *work) spin_unlock_irq(&rinfo->dev_info->io_lock); } -static void blkif_free(struct blkfront_info *info, int suspend) +static void blkif_free_ring(struct blkfront_ring_info *rinfo) { struct grant *persistent_gnt; - struct grant *n; + struct blkfront_info *info = rinfo->dev_info; int i, j, segs; - struct blkfront_ring_info *rinfo = &info->rinfo; - - /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&info->io_lock); - info->connected = suspend ? - BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; - /* No more blkif_request(). */ - if (info->rq) - blk_mq_stop_hw_queues(info->rq); - - /* Remove all persistent grants */ - if (!list_empty(&info->grants)) { - list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { - list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); - info->persistent_gnts_c--; - } - if (info->feature_persistent) - __free_page(persistent_gnt->page); - kfree(persistent_gnt); - } - } - BUG_ON(info->persistent_gnts_c != 0); /* * Remove indirect pages, this only happens when using indirect @@ -1186,7 +1163,6 @@ free_shadow: /* No more gnttab callback work. */ gnttab_cancel_free_callback(&rinfo->callback); - spin_unlock_irq(&info->io_lock); /* Flush gnttab callback work. Must be done with no locks held. */ flush_work(&rinfo->work); @@ -1204,7 +1180,45 @@ free_shadow: if (rinfo->irq) unbind_from_irqhandler(rinfo->irq, rinfo); rinfo->evtchn = rinfo->irq = 0; +} +static void blkif_free(struct blkfront_info *info, int suspend) +{ + struct grant *persistent_gnt, *n; + unsigned int i; + + /* Prevent new requests being issued until we fix things up. */ + spin_lock_irq(&info->io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). 
*/ + if (info->rq) + blk_mq_stop_hw_queues(info->rq); + + /* Remove all persistent grants */ + if (!list_empty(&info->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &info->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + info->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(persistent_gnt->page); + kfree(persistent_gnt); + } + } + BUG_ON(info->persistent_gnts_c != 0); + + for (i = 0; i < info->nr_rings; i++) + blkif_free_ring(&info->rinfo[i]); + + kfree(info->rinfo); + info->rinfo = NULL; + info->nr_rings = 0; + spin_unlock_irq(&info->io_lock); } struct copy_from_grant { @@ -1492,7 +1506,7 @@ static int talk_to_blkback(struct xenbus_device *dev, int err, i; unsigned int max_page_order = 0; unsigned int ring_page_order = 0; - struct blkfront_ring_info *rinfo = &info->rinfo; + struct blkfront_ring_info *rinfo; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, "max-ring-page-order", "%u", &max_page_order); @@ -1503,10 +1517,13 @@ static int talk_to_blkback(struct xenbus_device *dev, info->nr_ring_pages = 1 << ring_page_order; } - /* Create shared ring, alloc event channel. */ - err = setup_blkring(dev, rinfo); - if (err) - goto out; + for (i = 0; i < info->nr_rings; i++) { + rinfo = &info->rinfo[i]; + /* Create shared ring, alloc event channel. */ + err = setup_blkring(dev, rinfo); + if (err) + goto destroy_blkring; + } again: err = xenbus_transaction_start(&xbt); @@ -1515,37 +1532,43 @@ again: goto destroy_blkring; } - if (info->nr_ring_pages == 1) { - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", rinfo->ring_ref[0]); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; - } - } else { - err = xenbus_printf(xbt, dev->nodename, - "ring-page-order", "%u", ring_page_order); - if (err) { - message = "writing ring-page-order"; - goto abort_transaction; - } - - for (i = 0; i < info->nr_ring_pages; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; - - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", rinfo->ring_ref[i]); + if (info->nr_rings == 1) { + rinfo = &info->rinfo[0]; + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", rinfo->ring_ref[0]); if (err) { message = "writing ring-ref"; goto abort_transaction; } + } else { + err = xenbus_printf(xbt, dev->nodename, + "ring-page-order", "%u", ring_page_order); + if (err) { + message = "writing ring-page-order"; + goto abort_transaction; + } + + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dev->nodename, ring_ref_name, + "%u", rinfo->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } } - } - err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", rinfo->evtchn); - if (err) { - message = "writing event-channel"; + err = xenbus_printf(xbt, dev->nodename, + "event-channel", "%u", rinfo->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + } else { + /* Not supported at this stage. 
*/ goto abort_transaction; } err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", @@ -1568,9 +1591,15 @@ again: goto destroy_blkring; } - for (i = 0; i < BLK_RING_SIZE(info); i++) - rinfo->shadow[i].req.u.rw.id = i+1; - rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + for (i = 0; i < info->nr_rings; i++) { + unsigned int j; + + rinfo = &info->rinfo[i]; + + for (j = 0; j < BLK_RING_SIZE(info); j++) + rinfo->shadow[j].req.u.rw.id = j + 1; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + } xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1581,7 +1610,7 @@ again: xenbus_dev_fatal(dev, err, "%s", message); destroy_blkring: blkif_free(info, 0); - out: + return err; } @@ -1595,8 +1624,8 @@ static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { int err, vdevice; + unsigned int r_index; struct blkfront_info *info; - struct blkfront_ring_info *rinfo; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1646,10 +1675,22 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } - rinfo = &info->rinfo; - INIT_LIST_HEAD(&rinfo->indirect_pages); - rinfo->dev_info = info; - INIT_WORK(&rinfo->work, blkif_restart_queue); + info->nr_rings = 1; + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); + if (!info->rinfo) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); + kfree(info); + return -ENOMEM; + } + + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + INIT_LIST_HEAD(&rinfo->indirect_pages); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + } mutex_init(&info->mutex); spin_lock_init(&info->io_lock); @@ -1681,7 +1722,7 @@ static void split_bio_end(struct bio *bio) static int blkif_recover(struct blkfront_info *info) { - int i; + unsigned int i, r_index; struct request *req, *n; struct blk_shadow *copy; int rc; @@ -1691,57 +1732,62 @@ static int blkif_recover(struct blkfront_info *info) int pending, size; struct split_bio *split_bio; struct list_head requests; - struct blkfront_ring_info *rinfo = &info->rinfo; - - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - rinfo->shadow[i].req.u.rw.id = i+1; - rinfo->shadow_free = rinfo->ring.req_prod_pvt; - rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - - rc = blkfront_gather_backend_features(info); - if (rc) { - kfree(copy); - return rc; - } + blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - /* - * Get the bios in the request so we can re-queue them. 
- */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { - /* - * Flush operations don't contain bios, so - * we need to requeue the whole request - */ - list_add(©[i].request->queuelist, &requests); - continue; + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); + if (!copy) + return -ENOMEM; + + /* Stage 2: Set up free list. */ + memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); + for (i = 0; i < BLK_RING_SIZE(info); i++) + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow_free = rinfo->ring.req_prod_pvt; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + + rc = blkfront_setup_indirect(rinfo); + if (rc) { + kfree(copy); + return rc; } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); + + for (i = 0; i < BLK_RING_SIZE(info); i++) { + /* Not in use? */ + if (!copy[i].request) + continue; + + /* + * Get the bios in the request so we can re-queue them. + */ + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; + } + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_end_request_all(copy[i].request, 0); + } + + kfree(copy); } - - kfree(copy); - xenbus_switch_state(info->xbdev, XenbusStateConnected); spin_lock_irq(&info->io_lock); @@ -1749,8 +1795,13 @@ static int blkif_recover(struct blkfront_info *info) /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Kick any other new requests queued since we resumed */ - kick_pending_request_queues(rinfo); + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(rinfo); + } list_for_each_entry_safe(req, n, &requests, queuelist) { /* Requeue pending requests (flush or discard) */ @@ -1961,7 +2012,7 @@ out_of_memory: /* * Gather all backend feature-* */ -static int blkfront_gather_backend_features(struct blkfront_info *info) +static void blkfront_gather_backend_features(struct blkfront_info *info) { int err; int barrier, flush, discard, persistent; @@ -2016,8 +2067,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) else info->max_indirect_segments = min(indirect_segments, xen_blkif_max_segments); - - return blkfront_setup_indirect(&info->rinfo); } /* @@ -2030,8 +2079,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int physical_sector_size; unsigned int binfo; - int err; - struct blkfront_ring_info *rinfo = &info->rinfo; + int err, i; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2088,11 +2136,15 @@ static void blkfront_connect(struct blkfront_info *info) if (err != 1) physical_sector_size = sector_size; - err = blkfront_gather_backend_features(info); - if (err) { - xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", - 
info->xbdev->otherend); - return; + blkfront_gather_backend_features(info); + for (i = 0; i < info->nr_rings; i++) { + err = blkfront_setup_indirect(&info->rinfo[i]); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + blkif_free(info, 0); + break; + } } err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, @@ -2108,7 +2160,8 @@ static void blkfront_connect(struct blkfront_info *info) /* Kick pending requests. */ spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; - kick_pending_request_queues(rinfo); + for (i = 0; i < info->nr_rings; i++) + kick_pending_request_queues(&info->rinfo[i]); spin_unlock_irq(&info->io_lock); add_disk(info->gd); From 11659569f7202d0cb6553e81f9b8aa04dfeb94ce Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:13 +0800 Subject: [PATCH 04/21] xen/blkfront: split per device io_lock After the patch "xen/blkfront: separate per ring information out of device info", per-ring data is protected by a per-device lock ('io_lock'). This is not a good approach and will hurt scalability, so introduce a per-ring lock ('ring_lock'). The old 'io_lock' is renamed to 'dev_lock'; it protects the ->grants list and ->persistent_gnts_c, which are shared by all rings. Note that in 'blkfront_probe' the 'blkfront_info' is set up via kzalloc, so setting ->persistent_gnts_c to zero is not needed. Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 73 +++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 0638b1722a40..a9058bbdaa6b 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -125,6 +125,8 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the * depending on how many hardware queues/rings are to be used. */ struct blkfront_ring_info { + /* Lock to protect data in every ring buffer. */ + spinlock_t ring_lock; struct blkif_front_ring ring; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; unsigned int evtchn, irq; @@ -143,7 +145,6 @@ struct blkfront_ring_info { */ struct blkfront_info { - spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; @@ -153,6 +154,11 @@ struct blkfront_info /* Number of pages per ring buffer. */ unsigned int nr_ring_pages; struct request_queue *rq; + /* + * Lock to protect info->grants list and persistent_gnts_c shared by all + * rings.
+ */ + spinlock_t dev_lock; struct list_head grants; unsigned int persistent_gnts_c; unsigned int feature_flush; @@ -258,7 +264,9 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) } gnt_list_entry->gref = GRANT_INVALID_REF; + spin_lock_irq(&info->dev_lock); list_add(&gnt_list_entry->node, &info->grants); + spin_unlock_irq(&info->dev_lock); i++; } @@ -267,7 +275,9 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, &info->grants, node) { + spin_lock_irq(&info->dev_lock); list_del(&gnt_list_entry->node); + spin_unlock_irq(&info->dev_lock); if (info->feature_persistent) __free_page(gnt_list_entry->page); kfree(gnt_list_entry); @@ -280,7 +290,9 @@ out_of_memory: static struct grant *get_free_grant(struct blkfront_info *info) { struct grant *gnt_list_entry; + unsigned long flags; + spin_lock_irqsave(&info->dev_lock, flags); BUG_ON(list_empty(&info->grants)); gnt_list_entry = list_first_entry(&info->grants, struct grant, node); @@ -288,6 +300,7 @@ static struct grant *get_free_grant(struct blkfront_info *info) if (gnt_list_entry->gref != GRANT_INVALID_REF) info->persistent_gnts_c--; + spin_unlock_irqrestore(&info->dev_lock, flags); return gnt_list_entry; } @@ -757,11 +770,11 @@ static inline bool blkif_request_flush_invalid(struct request *req, static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { + unsigned long flags; struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; - struct blkfront_info *info = rinfo->dev_info; blk_mq_start_request(qd->rq); - spin_lock_irq(&info->io_lock); + spin_lock_irqsave(&rinfo->ring_lock, flags); if (RING_FULL(&rinfo->ring)) goto out_busy; @@ -772,15 +785,15 @@ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_busy; flush_requests(rinfo); - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_OK; out_err: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_ERROR; out_busy: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); return BLK_MQ_RQ_QUEUE_BUSY; } @@ -1082,21 +1095,28 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) info->gd = NULL; } -/* Must be called with io_lock holded */ -static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) +/* Already hold rinfo->ring_lock. */ +static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { if (!RING_FULL(&rinfo->ring)) blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } +static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) +{ + unsigned long flags; + + spin_lock_irqsave(&rinfo->ring_lock, flags); + kick_pending_request_queues_locked(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); +} + static void blkif_restart_queue(struct work_struct *work) { struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); - spin_lock_irq(&rinfo->dev_info->io_lock); if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) kick_pending_request_queues(rinfo); - spin_unlock_irq(&rinfo->dev_info->io_lock); } static void blkif_free_ring(struct blkfront_ring_info *rinfo) @@ -1188,7 +1208,6 @@ static void blkif_free(struct blkfront_info *info, int suspend) unsigned int i; /* Prevent new requests being issued until we fix things up. 
*/ - spin_lock_irq(&info->io_lock); info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; /* No more blkif_request(). */ @@ -1196,6 +1215,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_mq_stop_hw_queues(info->rq); /* Remove all persistent grants */ + spin_lock_irq(&info->dev_lock); if (!list_empty(&info->grants)) { list_for_each_entry_safe(persistent_gnt, n, &info->grants, node) { @@ -1211,6 +1231,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) } } BUG_ON(info->persistent_gnts_c != 0); + spin_unlock_irq(&info->dev_lock); for (i = 0; i < info->nr_rings; i++) blkif_free_ring(&info->rinfo[i]); @@ -1218,7 +1239,6 @@ static void blkif_free(struct blkfront_info *info, int suspend) kfree(info->rinfo); info->rinfo = NULL; info->nr_rings = 0; - spin_unlock_irq(&info->io_lock); } struct copy_from_grant { @@ -1253,6 +1273,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri int i = 0; struct scatterlist *sg; int num_sg, num_grant; + unsigned long flags; struct blkfront_info *info = rinfo->dev_info; struct copy_from_grant data = { .s = s, @@ -1291,8 +1312,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->grants_used[i]->gref); + spin_lock_irqsave(&info->dev_lock, flags); list_add(&s->grants_used[i]->node, &info->grants); info->persistent_gnts_c++; + spin_unlock_irqrestore(&info->dev_lock, flags); } else { /* * If the grant is not mapped by the backend we end the @@ -1302,7 +1325,9 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri */ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; + spin_lock_irqsave(&info->dev_lock, flags); list_add_tail(&s->grants_used[i]->node, &info->grants); + spin_unlock_irqrestore(&info->dev_lock, flags); } } if (s->req.operation == BLKIF_OP_INDIRECT) { @@ -1311,8 +1336,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->indirect_grants[i]->gref); + spin_lock_irqsave(&info->dev_lock, flags); list_add(&s->indirect_grants[i]->node, &info->grants); info->persistent_gnts_c++; + spin_unlock_irqrestore(&info->dev_lock, flags); } else { struct page *indirect_page; @@ -1326,7 +1353,9 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; + spin_lock_irqsave(&info->dev_lock, flags); list_add_tail(&s->indirect_grants[i]->node, &info->grants); + spin_unlock_irqrestore(&info->dev_lock, flags); } } } @@ -1342,13 +1371,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkfront_info *info = rinfo->dev_info; int error; - spin_lock_irqsave(&info->io_lock, flags); - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&info->io_lock, flags); + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return IRQ_HANDLED; - } + spin_lock_irqsave(&rinfo->ring_lock, flags); again: rp = rinfo->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. 
*/ @@ -1439,9 +1465,9 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } else rinfo->ring.sring->rsp_event = i + 1; - kick_pending_request_queues(rinfo); + kick_pending_request_queues_locked(rinfo); - spin_unlock_irqrestore(&info->io_lock, flags); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return IRQ_HANDLED; } @@ -1690,14 +1716,14 @@ static int blkfront_probe(struct xenbus_device *dev, INIT_LIST_HEAD(&rinfo->indirect_pages); rinfo->dev_info = info; INIT_WORK(&rinfo->work, blkif_restart_queue); + spin_lock_init(&rinfo->ring_lock); } mutex_init(&info->mutex); - spin_lock_init(&info->io_lock); + spin_lock_init(&info->dev_lock); info->xbdev = dev; info->vdevice = vdevice; INIT_LIST_HEAD(&info->grants); - info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; /* Front end dir is a number, which is used as the id. */ @@ -1790,8 +1816,6 @@ static int blkif_recover(struct blkfront_info *info) } xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&info->io_lock); - /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; @@ -1809,7 +1833,6 @@ static int blkif_recover(struct blkfront_info *info) BUG_ON(req->nr_phys_segments > segs); blk_mq_requeue_request(req); } - spin_unlock_irq(&info->io_lock); blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { @@ -2158,11 +2181,9 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ - spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; for (i = 0; i < info->nr_rings; i++) kick_pending_request_queues(&info->rinfo[i]); - spin_unlock_irq(&info->io_lock); add_disk(info->gd); From 28d949bcc28bbc2d206f9c3f69b892575e81c040 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:14 +0800 Subject: [PATCH 05/21] xen/blkfront: negotiate number of queues/rings to be used with backend The max number of hardware queues for xen/blkfront is set by the parameter 'max_queues' (default 4), and it is also capped by the max value that xen/blkback exposes through the XenStore key 'multi-queue-max-queues'. The negotiated number is the smaller of the two; it is written back to XenStore as "multi-queue-num-queues", from which blkback reads it. Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 162 ++++++++++++++++++++++++++--------- 1 file changed, 120 insertions(+), 42 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a9058bbdaa6b..81c87e69cbe9 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -99,6 +99,10 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); +static unsigned int xen_blkif_max_queues = 4; +module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); +MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk"); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -118,6 +122,10 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the * characters are enough. Define to 20 to keep consist with backend. */ #define RINGREF_NAME_LEN (20) +/* + * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
+ */ +#define QUEUE_NAME_LEN (17) /* * Per-ring info. @@ -823,7 +831,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = 1; + info->tag_set.nr_hw_queues = info->nr_rings; info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; @@ -1522,6 +1530,53 @@ fail: return err; } +/* + * Write out per-ring/queue nodes including ring-ref and event-channel, and each + * ring buffer may have multi pages depending on ->nr_ring_pages. + */ +static int write_per_ring_nodes(struct xenbus_transaction xbt, + struct blkfront_ring_info *rinfo, const char *dir) +{ + int err; + unsigned int i; + const char *message = NULL; + struct blkfront_info *info = rinfo->dev_info; + + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dir, ring_ref_name, + "%u", rinfo->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } + } + + err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(info->xbdev, err, "%s", message); + + return err; +} /* Common code used when first setting up, and when resuming. */ static int talk_to_blkback(struct xenbus_device *dev, @@ -1529,10 +1584,9 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err, i; - unsigned int max_page_order = 0; + int err; + unsigned int i, max_page_order = 0; unsigned int ring_page_order = 0; - struct blkfront_ring_info *rinfo; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, "max-ring-page-order", "%u", &max_page_order); @@ -1544,7 +1598,8 @@ static int talk_to_blkback(struct xenbus_device *dev, } for (i = 0; i < info->nr_rings; i++) { - rinfo = &info->rinfo[i]; + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + /* Create shared ring, alloc event channel. 
*/ err = setup_blkring(dev, rinfo); if (err) @@ -1558,44 +1613,49 @@ again: goto destroy_blkring; } - if (info->nr_rings == 1) { - rinfo = &info->rinfo[0]; - if (info->nr_ring_pages == 1) { - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", rinfo->ring_ref[0]); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; - } - } else { - err = xenbus_printf(xbt, dev->nodename, - "ring-page-order", "%u", ring_page_order); - if (err) { - message = "writing ring-page-order"; - goto abort_transaction; - } - - for (i = 0; i < info->nr_ring_pages; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; - - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", rinfo->ring_ref[i]); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; - } - } - } - err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", rinfo->evtchn); + if (info->nr_ring_pages > 1) { + err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u", + ring_page_order); if (err) { - message = "writing event-channel"; + message = "writing ring-page-order"; goto abort_transaction; } + } + + /* We already got the number of queues/rings in _probe */ + if (info->nr_rings == 1) { + err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename); + if (err) + goto destroy_blkring; } else { - /* Not supported at this stage. */ - goto abort_transaction; + char *path; + size_t pathsize; + + err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u", + info->nr_rings); + if (err) { + message = "writing multi-queue-num-queues"; + goto abort_transaction; + } + + pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN; + path = kmalloc(pathsize, GFP_KERNEL); + if (!path) { + err = -ENOMEM; + message = "ENOMEM while writing ring references"; + goto abort_transaction; + } + + for (i = 0; i < info->nr_rings; i++) { + memset(path, 0, pathsize); + snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i); + err = write_per_ring_nodes(xbt, &info->rinfo[i], path); + if (err) { + kfree(path); + goto destroy_blkring; + } + } + kfree(path); } err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); @@ -1619,8 +1679,7 @@ again: for (i = 0; i < info->nr_rings; i++) { unsigned int j; - - rinfo = &info->rinfo[i]; + struct blkfront_ring_info *rinfo = &info->rinfo[i]; for (j = 0; j < BLK_RING_SIZE(info); j++) rinfo->shadow[j].req.u.rw.id = j + 1; @@ -1652,6 +1711,7 @@ static int blkfront_probe(struct xenbus_device *dev, int err, vdevice; unsigned int r_index; struct blkfront_info *info; + unsigned int backend_max_queues = 0; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1701,7 +1761,18 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } - info->nr_rings = 1; + info->xbdev = dev; + /* Check if backend supports multiple queues. */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err < 0) + backend_max_queues = 1; + + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. 
*/ + if (!info->nr_rings) + info->nr_rings = 1; + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); if (!info->rinfo) { xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); @@ -2390,6 +2461,7 @@ static struct xenbus_driver blkfront_driver = { static int __init xlblk_init(void) { int ret; + int nr_cpus = num_online_cpus(); if (!xen_domain()) return -ENODEV; @@ -2400,6 +2472,12 @@ static int __init xlblk_init(void) xen_blkif_max_ring_order = 0; } + if (xen_blkif_max_queues > nr_cpus) { + pr_info("Invalid max_queues (%d), will use default max: %d.\n", + xen_blkif_max_queues, nr_cpus); + xen_blkif_max_queues = nr_cpus; + } + if (!xen_has_pv_disk_devices()) return -ENODEV; From 6f03a7ff89485f0a7a559bf5c7631d2986c4ecfa Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 16 Nov 2015 15:14:41 -0500 Subject: [PATCH 06/21] xen/blkfront: Cleanup of comments, fix unaligned variables, and syntax errors. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 81c87e69cbe9..87ab09fc7b5f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -118,8 +118,8 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) /* - * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 - * characters are enough. Define to 20 to keep consist with backend. + * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep consistent with backend. */ #define RINGREF_NAME_LEN (20) /* @@ -238,7 +238,7 @@ static int get_id_from_freelist(struct blkfront_ring_info *rinfo) } static int add_id_to_freelist(struct blkfront_ring_info *rinfo, - unsigned long id) + unsigned long id) { if (rinfo->shadow[id].req.u.rw.id != id) return -EINVAL; @@ -257,7 +257,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) struct grant *gnt_list_entry, *n; int i = 0; - while(i < num) { + while (i < num) { gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); if (!gnt_list_entry) goto out_of_memory; @@ -776,7 +776,7 @@ static inline bool blkif_request_flush_invalid(struct request *req, } static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *qd) + const struct blk_mq_queue_data *qd) { unsigned long flags; struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; @@ -1968,8 +1968,7 @@ static int blkfront_resume(struct xenbus_device *dev) return err; } -static void -blkfront_closing(struct blkfront_info *info) +static void blkfront_closing(struct blkfront_info *info) { struct xenbus_device *xbdev = info->xbdev; struct block_device *bdev = NULL; From 75f070b3967b0c3bf0e1bc43411b06bab6c2c2cd Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Mon, 16 Nov 2015 16:25:33 -0500 Subject: [PATCH 07/21] xen/blkfront: Remove duplicate setting of ->xbdev. We do the same exact operations a bit earlier in the function. 
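For reference, the surviving code earlier in blkfront_probe() looks roughly like this (an abridged sketch stitched together from the hunks of the previous patches, not a verbatim quote); ->xbdev has to be set before the xenbus_scanf() that reads 'multi-queue-max-queues', which is why the later assignment is redundant:

	info->xbdev = dev;
	/* Check if backend supports multiple queues. */
	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
			   "multi-queue-max-queues", "%u", &backend_max_queues);
	if (err < 0)
		backend_max_queues = 1;	/* backend does not advertise multi-queue */
	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
	if (!info->nr_rings)
		info->nr_rings = 1;	/* we need at least one ring */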
Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 87ab09fc7b5f..b7f06cf14788 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1792,7 +1792,6 @@ static int blkfront_probe(struct xenbus_device *dev, mutex_init(&info->mutex); spin_lock_init(&info->dev_lock); - info->xbdev = dev; info->vdevice = vdevice; INIT_LIST_HEAD(&info->grants); info->connected = BLKIF_STATE_DISCONNECTED; From 73716df7da4f60dd2d59a9302227d0394f1b8fcc Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Mon, 16 Nov 2015 16:51:39 -0500 Subject: [PATCH 08/21] xen/blkfront: make persistent grants pool per-queue Make persistent grants per-queue/ring instead of per-device, so that we can drop the 'dev_lock' and get better scalability. Test was done based on null_blk driver: dom0: v4.2-rc8 16vcpus 10GB "modprobe null_blk" domu: v4.2-rc8 16vcpus 10GB [test] rw=read direct=1 ioengine=libaio bs=4k time_based runtime=30 filename=/dev/xvdb numjobs=16 iodepth=64 iodepth_batch=64 iodepth_batch_complete=64 group_reporting Queues: 1 4 8 16 Iops orig(k): 810 1064 780 700 Iops patched(k): 810 1230(~20%) 1024(~20%) 850(~20%) Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 110 ++++++++++++++--------------------- 1 file changed, 43 insertions(+), 67 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index b7f06cf14788..9d46960b7d72 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -142,6 +142,8 @@ struct blkfront_ring_info { struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_MAX_RING_SIZE]; struct list_head indirect_pages; + struct list_head grants; + unsigned int persistent_gnts_c; unsigned long shadow_free; struct blkfront_info *dev_info; }; @@ -162,13 +164,6 @@ struct blkfront_info /* Number of pages per ring buffer. */ unsigned int nr_ring_pages; struct request_queue *rq; - /* - * Lock to protect info->grants list and persistent_gnts_c shared by all - * rings. 
- */ - spinlock_t dev_lock; - struct list_head grants; - unsigned int persistent_gnts_c; unsigned int feature_flush; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; @@ -272,9 +267,7 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) } gnt_list_entry->gref = GRANT_INVALID_REF; - spin_lock_irq(&info->dev_lock); - list_add(&gnt_list_entry->node, &info->grants); - spin_unlock_irq(&info->dev_lock); + list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -282,10 +275,8 @@ static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, - &info->grants, node) { - spin_lock_irq(&info->dev_lock); + &rinfo->grants, node) { list_del(&gnt_list_entry->node); - spin_unlock_irq(&info->dev_lock); if (info->feature_persistent) __free_page(gnt_list_entry->page); kfree(gnt_list_entry); @@ -295,20 +286,17 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_free_grant(struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) { struct grant *gnt_list_entry; - unsigned long flags; - spin_lock_irqsave(&info->dev_lock, flags); - BUG_ON(list_empty(&info->grants)); - gnt_list_entry = list_first_entry(&info->grants, struct grant, + BUG_ON(list_empty(&rinfo->grants)); + gnt_list_entry = list_first_entry(&rinfo->grants, struct grant, node); list_del(&gnt_list_entry->node); if (gnt_list_entry->gref != GRANT_INVALID_REF) - info->persistent_gnts_c--; - spin_unlock_irqrestore(&info->dev_lock, flags); + rinfo->persistent_gnts_c--; return gnt_list_entry; } @@ -324,9 +312,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry, static struct grant *get_grant(grant_ref_t *gref_head, unsigned long gfn, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -347,9 +336,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, } static struct grant *get_indirect_grant(grant_ref_t *gref_head, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -361,8 +351,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->rinfo->indirect_pages)); - indirect_page = list_first_entry(&info->rinfo->indirect_pages, + BUG_ON(list_empty(&rinfo->indirect_pages)); + indirect_page = list_first_entry(&rinfo->indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -543,7 +533,6 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; struct blkfront_ring_info *rinfo = setup->rinfo; - struct blkfront_info *info = rinfo->dev_info; struct blk_shadow *shadow = &rinfo->shadow[setup->id]; if ((ring_req->operation == BLKIF_OP_INDIRECT) && @@ -552,13 +541,13 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(setup->segments); n = grant_idx / 
GRANTS_PER_INDIRECT_FRAME; - gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo); shadow->indirect_grants[n] = gnt_list_entry; setup->segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo); ref = gnt_list_entry->gref; shadow->grants_used[grant_idx] = gnt_list_entry; @@ -1129,7 +1118,7 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free_ring(struct blkfront_ring_info *rinfo) { - struct grant *persistent_gnt; + struct grant *persistent_gnt, *n; struct blkfront_info *info = rinfo->dev_info; int i, j, segs; @@ -1147,6 +1136,23 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo) } } + /* Remove all persistent grants. */ + if (!list_empty(&rinfo->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &rinfo->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + rinfo->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(persistent_gnt->page); + kfree(persistent_gnt); + } + } + BUG_ON(rinfo->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already @@ -1212,7 +1218,6 @@ free_shadow: static void blkif_free(struct blkfront_info *info, int suspend) { - struct grant *persistent_gnt, *n; unsigned int i; /* Prevent new requests being issued until we fix things up. */ @@ -1222,25 +1227,6 @@ static void blkif_free(struct blkfront_info *info, int suspend) if (info->rq) blk_mq_stop_hw_queues(info->rq); - /* Remove all persistent grants */ - spin_lock_irq(&info->dev_lock); - if (!list_empty(&info->grants)) { - list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { - list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); - info->persistent_gnts_c--; - } - if (info->feature_persistent) - __free_page(persistent_gnt->page); - kfree(persistent_gnt); - } - } - BUG_ON(info->persistent_gnts_c != 0); - spin_unlock_irq(&info->dev_lock); - for (i = 0; i < info->nr_rings; i++) blkif_free_ring(&info->rinfo[i]); @@ -1281,7 +1267,6 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri int i = 0; struct scatterlist *sg; int num_sg, num_grant; - unsigned long flags; struct blkfront_info *info = rinfo->dev_info; struct copy_from_grant data = { .s = s, @@ -1320,10 +1305,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->grants_used[i]->gref); - spin_lock_irqsave(&info->dev_lock, flags); - list_add(&s->grants_used[i]->node, &info->grants); - info->persistent_gnts_c++; - spin_unlock_irqrestore(&info->dev_lock, flags); + list_add(&s->grants_used[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { /* * If the grant is not mapped by the backend we end the @@ -1333,9 +1316,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri */ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; - spin_lock_irqsave(&info->dev_lock, flags); - list_add_tail(&s->grants_used[i]->node, &info->grants); - 
spin_unlock_irqrestore(&info->dev_lock, flags); + list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { @@ -1344,10 +1325,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->indirect_grants[i]->gref); - spin_lock_irqsave(&info->dev_lock, flags); - list_add(&s->indirect_grants[i]->node, &info->grants); - info->persistent_gnts_c++; - spin_unlock_irqrestore(&info->dev_lock, flags); + list_add(&s->indirect_grants[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { struct page *indirect_page; @@ -1361,9 +1340,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; - spin_lock_irqsave(&info->dev_lock, flags); - list_add_tail(&s->indirect_grants[i]->node, &info->grants); - spin_unlock_irqrestore(&info->dev_lock, flags); + list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } @@ -1785,15 +1762,14 @@ static int blkfront_probe(struct xenbus_device *dev, rinfo = &info->rinfo[r_index]; INIT_LIST_HEAD(&rinfo->indirect_pages); + INIT_LIST_HEAD(&rinfo->grants); rinfo->dev_info = info; INIT_WORK(&rinfo->work, blkif_restart_queue); spin_lock_init(&rinfo->ring_lock); } mutex_init(&info->mutex); - spin_lock_init(&info->dev_lock); info->vdevice = vdevice; - INIT_LIST_HEAD(&info->grants); info->connected = BLKIF_STATE_DISCONNECTED; /* Front end dir is a number, which is used as the id. */ From 45fc82642e54018740a25444d1165901501b601b Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 25 Nov 2015 18:26:01 +0800 Subject: [PATCH 09/21] xen/blkfront: correct setting for xen_blkif_max_ring_order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to this piece of code: " pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); " if xen_blkif_max_ring_order is bigger than XENBUS_MAX_RING_GRANT_ORDER, xen_blkif_max_ring_order needs to be set to XENBUS_MAX_RING_GRANT_ORDER, not 0. Signed-off-by: Peng Fan Cc: Boris Ostrovsky Cc: David Vrabel Cc: "Roger Pau Monné" Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9d46960b7d72..8b0f3d92d8b4 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2443,7 +2443,7 @@ static int __init xlblk_init(void) if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); - xen_blkif_max_ring_order = 0; + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } if (xen_blkif_max_queues > nr_cpus) { From 597957000ab5b1b38085c20868f3f7b9c305bae5 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:15 +0800 Subject: [PATCH 10/21] xen/blkback: separate ring information out of struct xen_blkif Split per-ring information into a new structure "xen_blkif_ring", so that one vbd device can be associated with one or more rings/hardware queues. Introduce 'pers_gnts_lock' to protect the pool of persistent grants, since we may have multiple backend threads. This patch is a preparation for supporting multiple hardware queues/rings.
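As a rough illustration of the resulting layout (a sketch using the field names from the hunks below; example_use() is a hypothetical helper for illustration only, not code from this patch), per-ring code reaches the shared per-device state through a back pointer, and the still-shared tree of persistent grants is guarded by the new 'pers_gnts_lock':

	/* Hypothetical helper, illustration only. */
	static void example_use(struct xen_blkif_ring *ring)
	{
		struct xen_blkif *blkif = ring->blkif;	/* back pointer to the device */
		unsigned long flags;

		/*
		 * The persistent-grant tree stays per-device, so lookups and
		 * inserts must hold pers_gnts_lock: several ring threads may
		 * race on it once multiple queues are enabled.
		 */
		spin_lock_irqsave(&blkif->pers_gnts_lock, flags);
		/* ... get_persistent_gnt() / add_persistent_gnt() ... */
		spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags);
	}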
Signed-off-by: Arianna Avanzini Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- v2: Align the variables in the structure. --- drivers/block/xen-blkback/blkback.c | 235 ++++++++++++++++------------ drivers/block/xen-blkback/common.h | 56 ++++--- drivers/block/xen-blkback/xenbus.c | 96 ++++++------ 3 files changed, 215 insertions(+), 172 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index f9099940c272..4fd8640d146c 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -173,11 +173,11 @@ static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) -static int do_block_io_op(struct xen_blkif *blkif); -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int do_block_io_op(struct xen_blkif_ring *ring); +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req); -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st); #define foreach_grant_safe(pos, n, rbtree, node) \ @@ -189,14 +189,8 @@ static void make_response(struct xen_blkif *blkif, u64 id, /* - * We don't need locking around the persistent grant helpers - * because blkback uses a single-thread for each backed, so we - * can be sure that this functions will never be called recursively. - * - * The only exception to that is put_persistent_grant, that can be called - * from interrupt context (by xen_blkbk_unmap), so we have to use atomic - * bit operations to modify the flags of a persistent grant and to count - * the number of used grants. + * pers_gnts_lock must be used around all the persistent grant helpers + * because blkback may use multi-thread/queue for each backend. 
*/ static int add_persistent_gnt(struct xen_blkif *blkif, struct persistent_gnt *persistent_gnt) @@ -204,6 +198,7 @@ static int add_persistent_gnt(struct xen_blkif *blkif, struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; @@ -241,6 +236,7 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, struct persistent_gnt *data; struct rb_node *node = NULL; + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); node = blkif->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -265,6 +261,7 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, static void put_persistent_gnt(struct xen_blkif *blkif, struct persistent_gnt *persistent_gnt) { + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); @@ -286,6 +283,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; + BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == BLKBACK_INVALID_HANDLE); @@ -322,11 +320,13 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) int segs_to_unmap = 0; struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; + unsigned long flags; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; + spin_lock_irqsave(&blkif->pers_gnts_lock, flags); while(!list_empty(&blkif->persistent_purge_list)) { persistent_gnt = list_first_entry(&blkif->persistent_purge_list, struct persistent_gnt, @@ -348,6 +348,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) } kfree(persistent_gnt); } + spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); @@ -362,16 +363,18 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) unsigned int num_clean, total; bool scan_used = false, clean_used = false; struct rb_root *root; + unsigned long flags; + spin_lock_irqsave(&blkif->pers_gnts_lock, flags); if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || (blkif->persistent_gnt_c == xen_blkif_max_pgrants && !blkif->vbd.overflow_max_grants)) { - return; + goto out; } if (work_busy(&blkif->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); - return; + goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; @@ -379,7 +382,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) num_clean = min(blkif->persistent_gnt_c, num_clean); if ((num_clean == 0) || (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) - return; + goto out; /* * At this point, we can assure that there will be no calls @@ -436,29 +439,35 @@ finished: } blkif->persistent_gnt_c -= (total - num_clean); + spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ schedule_work(&blkif->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), 
total); return; + +out: + spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); + + return; } /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static struct pending_req *alloc_req(struct xen_blkif *blkif) +static struct pending_req *alloc_req(struct xen_blkif_ring *ring) { struct pending_req *req = NULL; unsigned long flags; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - if (!list_empty(&blkif->pending_free)) { - req = list_entry(blkif->pending_free.next, struct pending_req, + spin_lock_irqsave(&ring->pending_free_lock, flags); + if (!list_empty(&ring->pending_free)) { + req = list_entry(ring->pending_free.next, struct pending_req, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); return req; } @@ -466,17 +475,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(struct xen_blkif *blkif, struct pending_req *req) +static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) { unsigned long flags; int was_empty; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - was_empty = list_empty(&blkif->pending_free); - list_add(&req->free_list, &blkif->pending_free); - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_lock_irqsave(&ring->pending_free_lock, flags); + was_empty = list_empty(&ring->pending_free); + list_add(&req->free_list, &ring->pending_free); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); if (was_empty) - wake_up(&blkif->pending_free_wq); + wake_up(&ring->pending_free_wq); } /* @@ -556,10 +565,10 @@ abort: /* * Notification from the guest OS. 
*/ -static void blkif_notify_work(struct xen_blkif *blkif) +static void blkif_notify_work(struct xen_blkif_ring *ring) { - blkif->waiting_reqs = 1; - wake_up(&blkif->wq); + ring->waiting_reqs = 1; + wake_up(&ring->wq); } irqreturn_t xen_blkif_be_int(int irq, void *dev_id) @@ -590,7 +599,8 @@ static void print_stats(struct xen_blkif *blkif) int xen_blkif_schedule(void *arg) { - struct xen_blkif *blkif = arg; + struct xen_blkif_ring *ring = arg; + struct xen_blkif *blkif = ring->blkif; struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; @@ -606,27 +616,27 @@ int xen_blkif_schedule(void *arg) timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = wait_event_interruptible_timeout( - blkif->wq, - blkif->waiting_reqs || kthread_should_stop(), + ring->wq, + ring->waiting_reqs || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; timeout = wait_event_interruptible_timeout( - blkif->pending_free_wq, - !list_empty(&blkif->pending_free) || + ring->pending_free_wq, + !list_empty(&ring->pending_free) || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; - blkif->waiting_reqs = 0; + ring->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - ret = do_block_io_op(blkif); + ret = do_block_io_op(ring); if (ret > 0) - blkif->waiting_reqs = 1; + ring->waiting_reqs = 1; if (ret == -EACCES) - wait_event_interruptible(blkif->shutdown_wq, + wait_event_interruptible(ring->shutdown_wq, kthread_should_stop()); purge_gnt_list: @@ -649,7 +659,7 @@ purge_gnt_list: if (log_stats) print_stats(blkif); - blkif->xenblkd = NULL; + ring->xenblkd = NULL; xen_blkif_put(blkif); return 0; @@ -658,32 +668,40 @@ purge_gnt_list: /* * Remove persistent grants and empty the pool of free pages */ -void xen_blkbk_free_caches(struct xen_blkif *blkif) +void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + unsigned long flags; + /* Free all persistent grant pages */ + spin_lock_irqsave(&blkif->pers_gnts_lock, flags); if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) free_persistent_gnts(blkif, &blkif->persistent_gnts, blkif->persistent_gnt_c); BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); blkif->persistent_gnt_c = 0; + spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); /* Since we are shutting down remove all pages from the buffer */ shrink_free_pagepool(blkif, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( - struct xen_blkif *blkif, + struct xen_blkif_ring *ring, struct grant_page **pages, unsigned int num, struct gnttab_unmap_grant_ref *unmap_ops, struct page **unmap_pages) { unsigned int i, invcount = 0; + unsigned long flags; for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - put_persistent_gnt(blkif, pages[i]->persistent_gnt); + spin_lock_irqsave(&ring->blkif->pers_gnts_lock, flags); + put_persistent_gnt(ring->blkif, pages[i]->persistent_gnt); + spin_unlock_irqrestore(&ring->blkif->pers_gnts_lock, flags); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -700,17 +718,18 @@ static unsigned int xen_blkbk_unmap_prepare( static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) { - struct pending_req* pending_req = (struct pending_req*) (data->data); - struct xen_blkif *blkif = pending_req->blkif; + struct pending_req *pending_req = (struct pending_req *)(data->data); + struct xen_blkif_ring *ring = pending_req->ring; + struct xen_blkif *blkif = ring->blkif; /* BUG_ON used to reproduce existing behaviour, but is this the best way to 
deal with this? */ BUG_ON(result); put_free_pages(blkif, data->pages, data->count); - make_response(blkif, pending_req->id, + make_response(ring, pending_req->id, pending_req->operation, pending_req->status); - free_req(blkif, pending_req); + free_req(ring, pending_req); /* * Make sure the request is freed before releasing blkif, * or there could be a race between free_req and the @@ -723,7 +742,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ * pending_free_wq if there's a drain going on, but it has * to be taken into account if the current model is changed. */ - if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) { complete(&blkif->drain_complete); } xen_blkif_put(blkif); @@ -732,11 +751,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ static void xen_blkbk_unmap_and_respond(struct pending_req *req) { struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; - struct xen_blkif *blkif = req->blkif; + struct xen_blkif_ring *ring = req->ring; struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, + invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -757,7 +776,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) * of hypercalls, but since this is only used in error paths there's * no real need. */ -static void xen_blkbk_unmap(struct xen_blkif *blkif, +static void xen_blkbk_unmap(struct xen_blkif_ring *ring, struct grant_page *pages[], int num) { @@ -768,20 +787,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, while (num) { unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, + + invcount = xen_blkbk_unmap_prepare(ring, pages, batch, unmap, unmap_pages); if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(blkif, unmap_pages, invcount); + put_free_pages(ring->blkif, unmap_pages, invcount); } pages += batch; num -= batch; } } -static int xen_blkbk_map(struct xen_blkif *blkif, +static int xen_blkbk_map(struct xen_blkif_ring *ring, struct grant_page *pages[], int num, bool ro) { @@ -794,6 +813,8 @@ static int xen_blkbk_map(struct xen_blkif *blkif, int ret = 0; int last_map = 0, map_until = 0; int use_persistent_gnts; + struct xen_blkif *blkif = ring->blkif; + unsigned long irq_flags; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -806,10 +827,13 @@ again: for (i = map_until; i < num; i++) { uint32_t flags; - if (use_persistent_gnts) + if (use_persistent_gnts) { + spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); persistent_gnt = get_persistent_gnt( blkif, pages[i]->gref); + spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); + } if (persistent_gnt) { /* @@ -880,8 +904,10 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; + spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); if (add_persistent_gnt(blkif, persistent_gnt)) { + spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); kfree(persistent_gnt); persistent_gnt = NULL; goto next; @@ -890,6 +916,7 @@ again: pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", persistent_gnt->gnt, blkif->persistent_gnt_c, xen_blkif_max_pgrants); + 
spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); goto next; } if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { @@ -921,7 +948,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) { int rc; - rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + rc = xen_blkbk_map(pending_req->ring, pending_req->segments, pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); @@ -934,7 +961,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct phys_req *preq) { struct grant_page **pages = pending_req->indirect_pages; - struct xen_blkif *blkif = pending_req->blkif; + struct xen_blkif_ring *ring = pending_req->ring; int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; @@ -945,7 +972,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, for (i = 0; i < indirect_grefs; i++) pages[i]->gref = req->u.indirect.indirect_grefs[i]; - rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + rc = xen_blkbk_map(ring, pages, indirect_grefs, true); if (rc) goto unmap; @@ -972,15 +999,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, unmap: if (segments) kunmap_atomic(segments); - xen_blkbk_unmap(blkif, pages, indirect_grefs); + xen_blkbk_unmap(ring, pages, indirect_grefs); return rc; } -static int dispatch_discard_io(struct xen_blkif *blkif, +static int dispatch_discard_io(struct xen_blkif_ring *ring, struct blkif_request *req) { int err = 0; int status = BLKIF_RSP_OKAY; + struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; unsigned long secure; struct phys_req preq; @@ -1013,26 +1041,28 @@ fail_response: } else if (err) status = BLKIF_RSP_ERROR; - make_response(blkif, req->u.discard.id, req->operation, status); + make_response(ring, req->u.discard.id, req->operation, status); xen_blkif_put(blkif); return err; } -static int dispatch_other_io(struct xen_blkif *blkif, +static int dispatch_other_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { - free_req(blkif, pending_req); - make_response(blkif, req->u.other.id, req->operation, + free_req(ring, pending_req); + make_response(ring, req->u.other.id, req->operation, BLKIF_RSP_EOPNOTSUPP); return -EIO; } -static void xen_blk_drain_io(struct xen_blkif *blkif) +static void xen_blk_drain_io(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + atomic_set(&blkif->drain, 1); do { - if (atomic_read(&blkif->inflight) == 0) + if (atomic_read(&ring->inflight) == 0) break; wait_for_completion_interruptible_timeout( &blkif->drain_complete, HZ); @@ -1053,12 +1083,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { pr_debug("flush diskcache op failed, not supported\n"); - xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { pr_debug("write barrier op failed, not supported\n"); - xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug("Buffer not up-to-date at end of operation," @@ -1092,9 +1122,9 @@ static void end_block_io_op(struct bio *bio) * and transmute it to the block API to hand it over to 
the proper block disk. */ static int -__do_block_io_op(struct xen_blkif *blkif) +__do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; struct blkif_request req; struct pending_req *pending_req; RING_IDX rc, rp; @@ -1107,7 +1137,7 @@ __do_block_io_op(struct xen_blkif *blkif) if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { rc = blk_rings->common.rsp_prod_pvt; pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", - rp, rc, rp - rc, blkif->vbd.pdevice); + rp, rc, rp - rc, ring->blkif->vbd.pdevice); return -EACCES; } while (rc != rp) { @@ -1120,14 +1150,14 @@ __do_block_io_op(struct xen_blkif *blkif) break; } - pending_req = alloc_req(blkif); + pending_req = alloc_req(ring); if (NULL == pending_req) { - blkif->st_oo_req++; + ring->blkif->st_oo_req++; more_to_do = 1; break; } - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); break; @@ -1151,16 +1181,16 @@ __do_block_io_op(struct xen_blkif *blkif) case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_INDIRECT: - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (dispatch_rw_block_io(ring, &req, pending_req)) goto done; break; case BLKIF_OP_DISCARD: - free_req(blkif, pending_req); - if (dispatch_discard_io(blkif, &req)) + free_req(ring, pending_req); + if (dispatch_discard_io(ring, &req)) goto done; break; default: - if (dispatch_other_io(blkif, &req, pending_req)) + if (dispatch_other_io(ring, &req, pending_req)) goto done; break; } @@ -1173,13 +1203,13 @@ done: } static int -do_block_io_op(struct xen_blkif *blkif) +do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; int more_to_do; do { - more_to_do = __do_block_io_op(blkif); + more_to_do = __do_block_io_op(ring); if (more_to_do) break; @@ -1192,7 +1222,7 @@ do_block_io_op(struct xen_blkif *blkif) * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. */ -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { @@ -1220,17 +1250,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, switch (req_operation) { case BLKIF_OP_READ: - blkif->st_rd_req++; + ring->blkif->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: - blkif->st_wr_req++; + ring->blkif->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: - blkif->st_f_req++; + ring->blkif->st_f_req++; operation = WRITE_FLUSH; break; default: @@ -1255,7 +1285,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.nr_sects = 0; - pending_req->blkif = blkif; + pending_req->ring = ring; pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; @@ -1282,12 +1312,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - if (xen_vbd_translate(&preq, blkif, operation) != 0) { + if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) { pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? 
"read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, - blkif->vbd.pdevice); + ring->blkif->vbd.pdevice); goto fail_response; } @@ -1299,7 +1329,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { pr_debug("Misaligned I/O request from domain %d\n", - blkif->domid); + ring->blkif->domid); goto fail_response; } } @@ -1308,7 +1338,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * issue the WRITE_FLUSH. */ if (drain) - xen_blk_drain_io(pending_req->blkif); + xen_blk_drain_io(pending_req->ring); /* * If we have failed at this point, we need to undo the M2P override, @@ -1323,8 +1353,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * This corresponding xen_blkif_put is done in __end_block_io_op, or * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. */ - xen_blkif_get(blkif); - atomic_inc(&blkif->inflight); + xen_blkif_get(ring->blkif); + atomic_inc(&ring->inflight); for (i = 0; i < nseg; i++) { while ((bio == NULL) || @@ -1372,19 +1402,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blk_finish_plug(&plug); if (operation == READ) - blkif->st_rd_sect += preq.nr_sects; + ring->blkif->st_rd_sect += preq.nr_sects; else if (operation & WRITE) - blkif->st_wr_sect += preq.nr_sects; + ring->blkif->st_wr_sect += preq.nr_sects; return 0; fail_flush: - xen_blkbk_unmap(blkif, pending_req->segments, + xen_blkbk_unmap(ring, pending_req->segments, pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); - free_req(blkif, pending_req); + make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -1402,21 +1432,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* * Put a response on the ring on how the operation fared. */ -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st) { struct blkif_response resp; unsigned long flags; - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings; int notify; resp.id = id; resp.operation = op; resp.status = st; - spin_lock_irqsave(&blkif->blk_ring_lock, flags); + spin_lock_irqsave(&ring->blk_ring_lock, flags); + blk_rings = &ring->blk_rings; /* Place on the response ring for the relevant domain. */ - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), &resp, sizeof(resp)); @@ -1434,9 +1465,9 @@ static void make_response(struct xen_blkif *blkif, u64 id, } blk_rings->common.rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + spin_unlock_irqrestore(&ring->blk_ring_lock, flags); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(ring->irq); } static int __init xen_blkif_init(void) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 68e87a037b99..dbdf4164c83f 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -269,34 +269,50 @@ struct persistent_gnt { struct list_head remove_node; }; +/* Per-ring information. */ +struct xen_blkif_ring { + /* Physical parameters of the comms window. 
*/ + unsigned int irq; + union blkif_back_rings blk_rings; + void *blk_ring; + /* Private fields. */ + spinlock_t blk_ring_lock; + + wait_queue_head_t wq; + atomic_t inflight; + /* One thread per blkif ring. */ + struct task_struct *xenblkd; + unsigned int waiting_reqs; + + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + + struct work_struct free_work; + /* Thread shutdown wait queue. */ + wait_queue_head_t shutdown_wq; + struct xen_blkif *blkif; +}; + struct xen_blkif { /* Unique identifier for this interface. */ domid_t domid; unsigned int handle; - /* Physical parameters of the comms window. */ - unsigned int irq; /* Comms information. */ enum blkif_protocol blk_protocol; - union blkif_back_rings blk_rings; - void *blk_ring; /* The VBD attached to this interface. */ struct xen_vbd vbd; /* Back pointer to the backend_info. */ struct backend_info *be; - /* Private fields. */ - spinlock_t blk_ring_lock; atomic_t refcnt; - - wait_queue_head_t wq; /* for barrier (drain) requests */ struct completion drain_complete; atomic_t drain; - atomic_t inflight; - /* One thread per one blkif. */ - struct task_struct *xenblkd; - unsigned int waiting_reqs; /* tree to store persistent grants */ + spinlock_t pers_gnts_lock; struct rb_root persistent_gnts; unsigned int persistent_gnt_c; atomic_t persistent_gnt_in_use; @@ -311,12 +327,6 @@ struct xen_blkif { int free_pages_num; struct list_head free_pages; - /* List of all 'pending_req' available */ - struct list_head pending_free; - /* And its spinlock. */ - spinlock_t pending_free_lock; - wait_queue_head_t pending_free_wq; - /* statistics */ unsigned long st_print; unsigned long long st_rd_req; @@ -328,9 +338,9 @@ struct xen_blkif { unsigned long long st_wr_sect; struct work_struct free_work; - /* Thread shutdown wait queue. */ - wait_queue_head_t shutdown_wq; - unsigned int nr_ring_pages; + unsigned int nr_ring_pages; + /* All rings for this device. */ + struct xen_blkif_ring ring; }; struct seg_buf { @@ -352,7 +362,7 @@ struct grant_page { * response queued for it, with the saved 'id' passed back. */ struct pending_req { - struct xen_blkif *blkif; + struct xen_blkif_ring *ring; u64 id; int nr_segs; atomic_t pendcnt; @@ -394,7 +404,7 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); int xen_blkif_purge_persistent(void *arg); -void xen_blkbk_free_caches(struct xen_blkif *blkif); +void xen_blkbk_free_caches(struct xen_blkif_ring *ring); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f53cff42f8da..e4bfc928035d 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -88,7 +88,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) char name[BLKBACK_NAME_LEN]; /* Not ready to connect? */ - if (!blkif->irq || !blkif->vbd.bdev) + if (!blkif->ring.irq || !blkif->vbd.bdev) return; /* Already connected? 
*/ @@ -113,10 +113,10 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) } invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); - blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); - if (IS_ERR(blkif->xenblkd)) { - err = PTR_ERR(blkif->xenblkd); - blkif->xenblkd = NULL; + blkif->ring.xenblkd = kthread_run(xen_blkif_schedule, &blkif->ring, "%s", name); + if (IS_ERR(blkif->ring.xenblkd)) { + err = PTR_ERR(blkif->ring.xenblkd); + blkif->ring.xenblkd = NULL; xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); return; } @@ -125,6 +125,7 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) static struct xen_blkif *xen_blkif_alloc(domid_t domid) { struct xen_blkif *blkif; + struct xen_blkif_ring *ring; BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); @@ -133,41 +134,40 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) return ERR_PTR(-ENOMEM); blkif->domid = domid; - spin_lock_init(&blkif->blk_ring_lock); atomic_set(&blkif->refcnt, 1); - init_waitqueue_head(&blkif->wq); init_completion(&blkif->drain_complete); - atomic_set(&blkif->drain, 0); - blkif->st_print = jiffies; - blkif->persistent_gnts.rb_node = NULL; + INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); spin_lock_init(&blkif->free_pages_lock); INIT_LIST_HEAD(&blkif->free_pages); INIT_LIST_HEAD(&blkif->persistent_purge_list); - blkif->free_pages_num = 0; - atomic_set(&blkif->persistent_gnt_in_use, 0); - atomic_set(&blkif->inflight, 0); + blkif->st_print = jiffies; INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - INIT_LIST_HEAD(&blkif->pending_free); - INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - spin_lock_init(&blkif->pending_free_lock); - init_waitqueue_head(&blkif->pending_free_wq); - init_waitqueue_head(&blkif->shutdown_wq); + ring = &blkif->ring; + ring->blkif = blkif; + spin_lock_init(&ring->blk_ring_lock); + init_waitqueue_head(&ring->wq); + + INIT_LIST_HEAD(&ring->pending_free); + spin_lock_init(&ring->pending_free_lock); + init_waitqueue_head(&ring->pending_free_wq); + init_waitqueue_head(&ring->shutdown_wq); return blkif; } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, +static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, unsigned int nr_grefs, unsigned int evtchn) { int err; + struct xen_blkif *blkif = ring->blkif; /* Already connected through? 
*/ - if (blkif->irq) + if (ring->irq) return 0; err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, - &blkif->blk_ring); + &ring->blk_ring); if (err < 0) return err; @@ -175,24 +175,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, case BLKIF_PROTOCOL_NATIVE: { struct blkif_sring *sring; - sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, + sring = (struct blkif_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.native, sring, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64, XEN_PAGE_SIZE * nr_grefs); break; } @@ -202,13 +202,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, xen_blkif_be_int, 0, - "blkif-backend", blkif); + "blkif-backend", ring); if (err < 0) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; return err; } - blkif->irq = err; + ring->irq = err; return 0; } @@ -217,35 +217,36 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) { struct pending_req *req, *n; int i = 0, j; + struct xen_blkif_ring *ring = &blkif->ring; - if (blkif->xenblkd) { - kthread_stop(blkif->xenblkd); - wake_up(&blkif->shutdown_wq); - blkif->xenblkd = NULL; + if (ring->xenblkd) { + kthread_stop(ring->xenblkd); + wake_up(&ring->shutdown_wq); + ring->xenblkd = NULL; } /* The above kthread_stop() guarantees that at this point we * don't have any discard_io or other_io requests. So, checking * for inflight IO is enough. */ - if (atomic_read(&blkif->inflight) > 0) + if (atomic_read(&ring->inflight) > 0) return -EBUSY; - if (blkif->irq) { - unbind_from_irqhandler(blkif->irq, blkif); - blkif->irq = 0; + if (ring->irq) { + unbind_from_irqhandler(ring->irq, ring); + ring->irq = 0; } - if (blkif->blk_rings.common.sring) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; + if (ring->blk_rings.common.sring) { + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; } /* Remove all persistent grants and the cache of ballooned pages. 
*/ - xen_blkbk_free_caches(blkif); + xen_blkbk_free_caches(ring); /* Check that there is no request in use */ - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { list_del(&req->free_list); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) @@ -835,6 +836,7 @@ static int connect_ring(struct backend_info *be) char protocol[64] = ""; struct pending_req *req, *n; int err, i, j; + struct xen_blkif_ring *ring = &be->blkif->ring; pr_debug("%s %s\n", __func__, dev->otherend); @@ -923,7 +925,7 @@ static int connect_ring(struct backend_info *be) req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) goto fail; - list_add_tail(&req->free_list, &be->blkif->pending_free); + list_add_tail(&req->free_list, &ring->pending_free); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); if (!req->segments[j]) @@ -938,7 +940,7 @@ static int connect_ring(struct backend_info *be) } /* Map the shared frame, irq etc. */ - err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); + err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn); if (err) { xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; @@ -947,7 +949,7 @@ static int connect_ring(struct backend_info *be) return 0; fail: - list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { list_del(&req->free_list); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { if (!req->segments[j]) From 2fb1ef4f1226ea6d6d3481036cabe01a4415b68c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 11 Dec 2015 12:08:48 -0500 Subject: [PATCH 11/21] xen/blkback: pseudo support for multi hardware queues/rings Preparatory patch for multiple hardware queues (rings). The number of rings is unconditionally set to 1, larger number will be enabled in "xen/blkback: get the number of hardware queues/rings from blkfront". Signed-off-by: Arianna Avanzini Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- v2: Align variables in the structures. --- drivers/block/xen-blkback/common.h | 3 +- drivers/block/xen-blkback/xenbus.c | 293 ++++++++++++++++++----------- 2 files changed, 183 insertions(+), 113 deletions(-) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index dbdf4164c83f..310eff3cf43f 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -340,7 +340,8 @@ struct xen_blkif { struct work_struct free_work; unsigned int nr_ring_pages; /* All rings for this device. */ - struct xen_blkif_ring ring; + struct xen_blkif_ring *rings; + unsigned int nr_rings; }; struct seg_buf { diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index e4bfc928035d..f5bfedd0e948 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) { int err; char name[BLKBACK_NAME_LEN]; + struct xen_blkif_ring *ring; + int i; /* Not ready to connect? */ - if (!blkif->ring.irq || !blkif->vbd.bdev) + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev) return; /* Already connected? 
*/ @@ -113,19 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) } invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); - blkif->ring.xenblkd = kthread_run(xen_blkif_schedule, &blkif->ring, "%s", name); - if (IS_ERR(blkif->ring.xenblkd)) { - err = PTR_ERR(blkif->ring.xenblkd); - blkif->ring.xenblkd = NULL; - xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); - return; + for (i = 0; i < blkif->nr_rings; i++) { + ring = &blkif->rings[i]; + ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i); + if (IS_ERR(ring->xenblkd)) { + err = PTR_ERR(ring->xenblkd); + ring->xenblkd = NULL; + xenbus_dev_fatal(blkif->be->dev, err, + "start %s-%d xenblkd", name, i); + goto out; + } } + return; + +out: + while (--i >= 0) { + ring = &blkif->rings[i]; + kthread_stop(ring->xenblkd); + } + return; +} + +static int xen_blkif_alloc_rings(struct xen_blkif *blkif) +{ + unsigned int r; + + blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL); + if (!blkif->rings) + return -ENOMEM; + + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + + spin_lock_init(&ring->blk_ring_lock); + init_waitqueue_head(&ring->wq); + INIT_LIST_HEAD(&ring->pending_free); + + spin_lock_init(&ring->pending_free_lock); + init_waitqueue_head(&ring->pending_free_wq); + init_waitqueue_head(&ring->shutdown_wq); + ring->blkif = blkif; + xen_blkif_get(blkif); + } + + return 0; } static struct xen_blkif *xen_blkif_alloc(domid_t domid) { struct xen_blkif *blkif; - struct xen_blkif_ring *ring; BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); @@ -143,15 +181,11 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) blkif->st_print = jiffies; INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - ring = &blkif->ring; - ring->blkif = blkif; - spin_lock_init(&ring->blk_ring_lock); - init_waitqueue_head(&ring->wq); - - INIT_LIST_HEAD(&ring->pending_free); - spin_lock_init(&ring->pending_free_lock); - init_waitqueue_head(&ring->pending_free_wq); - init_waitqueue_head(&ring->shutdown_wq); + blkif->nr_rings = 1; + if (xen_blkif_alloc_rings(blkif)) { + kmem_cache_free(xen_blkif_cachep, blkif); + return ERR_PTR(-ENOMEM); + } return blkif; } @@ -216,50 +250,54 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, static int xen_blkif_disconnect(struct xen_blkif *blkif) { struct pending_req *req, *n; - int i = 0, j; - struct xen_blkif_ring *ring = &blkif->ring; + unsigned int j, r; - if (ring->xenblkd) { - kthread_stop(ring->xenblkd); - wake_up(&ring->shutdown_wq); - ring->xenblkd = NULL; + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + unsigned int i = 0; + + if (ring->xenblkd) { + kthread_stop(ring->xenblkd); + wake_up(&ring->shutdown_wq); + ring->xenblkd = NULL; + } + + /* The above kthread_stop() guarantees that at this point we + * don't have any discard_io or other_io requests. So, checking + * for inflight IO is enough. + */ + if (atomic_read(&ring->inflight) > 0) + return -EBUSY; + + if (ring->irq) { + unbind_from_irqhandler(ring->irq, ring); + ring->irq = 0; + } + + if (ring->blk_rings.common.sring) { + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; + } + + /* Remove all persistent grants and the cache of ballooned pages. 
*/ + xen_blkbk_free_caches(ring); + + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { + list_del(&req->free_list); + + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); + + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } + + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); } - - /* The above kthread_stop() guarantees that at this point we - * don't have any discard_io or other_io requests. So, checking - * for inflight IO is enough. - */ - if (atomic_read(&ring->inflight) > 0) - return -EBUSY; - - if (ring->irq) { - unbind_from_irqhandler(ring->irq, ring); - ring->irq = 0; - } - - if (ring->blk_rings.common.sring) { - xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); - ring->blk_rings.common.sring = NULL; - } - - /* Remove all persistent grants and the cache of ballooned pages. */ - xen_blkbk_free_caches(ring); - - /* Check that there is no request in use */ - list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { - list_del(&req->free_list); - - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) - kfree(req->segments[j]); - - for (j = 0; j < MAX_INDIRECT_PAGES; j++) - kfree(req->indirect_pages[j]); - - kfree(req); - i++; - } - - WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); blkif->nr_ring_pages = 0; return 0; @@ -279,6 +317,7 @@ static void xen_blkif_free(struct xen_blkif *blkif) BUG_ON(!list_empty(&blkif->free_pages)); BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); + kfree(blkif->rings); kmem_cache_free(xen_blkif_cachep, blkif); } @@ -427,6 +466,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, static int xen_blkbk_remove(struct xenbus_device *dev) { struct backend_info *be = dev_get_drvdata(&dev->dev); + unsigned int i; pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id); @@ -443,7 +483,8 @@ static int xen_blkbk_remove(struct xenbus_device *dev) if (be->blkif) { xen_blkif_disconnect(be->blkif); - xen_blkif_put(be->blkif); + for (i = 0; i < be->blkif->nr_rings; i++) + xen_blkif_put(be->blkif); } kfree(be->mode); @@ -826,51 +867,43 @@ again: xenbus_transaction_end(xbt, 1); } - -static int connect_ring(struct backend_info *be) +/* + * Each ring may have multi pages, depends on "ring-page-order". 
+ */ +static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) { - struct xenbus_device *dev = be->dev; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; - unsigned int evtchn, nr_grefs, ring_page_order; - unsigned int pers_grants; - char protocol[64] = ""; struct pending_req *req, *n; int err, i, j; - struct xen_blkif_ring *ring = &be->blkif->ring; + struct xen_blkif *blkif = ring->blkif; + struct xenbus_device *dev = blkif->be->dev; + unsigned int ring_page_order, nr_grefs, evtchn; - pr_debug("%s %s\n", __func__, dev->otherend); - - err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/event-channel", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir); return err; } - pr_info("event-channel %u\n", evtchn); err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", &ring_page_order); if (err != 1) { - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", - "%u", &ring_ref[0]); + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); return err; } nr_grefs = 1; - pr_info("%s:using single page: ring-ref %d\n", dev->otherend, - ring_ref[0]); } else { unsigned int i; if (ring_page_order > xen_blkif_max_ring_order) { err = -EINVAL; xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", - dev->otherend, ring_page_order, + dir, ring_page_order, xen_blkif_max_ring_order); return err; } @@ -880,46 +913,17 @@ static int connect_ring(struct backend_info *be) char ring_ref_name[RINGREF_NAME_LEN]; snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, "%u", &ring_ref[i]); if (err != 1) { err = -EINVAL; xenbus_dev_fatal(dev, err, "reading %s/%s", - dev->otherend, ring_ref_name); + dir, ring_ref_name); return err; } - pr_info("ring-ref%u: %u\n", i, ring_ref[i]); } } - - be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; - err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", - "%63s", protocol, NULL); - if (err) - strcpy(protocol, "unspecified, assuming default"); - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; - else { - xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); - return -1; - } - err = xenbus_gather(XBT_NIL, dev->otherend, - "feature-persistent", "%u", - &pers_grants, NULL); - if (err) - pers_grants = 0; - - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; - be->blkif->nr_ring_pages = nr_grefs; - - pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", - nr_grefs, evtchn, be->blkif->blk_protocol, protocol, - pers_grants ? 
"persistent grants" : ""); + blkif->nr_ring_pages = nr_grefs; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); @@ -964,6 +968,71 @@ fail: kfree(req); } return -ENOMEM; + +} + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned int pers_grants; + char protocol[64] = ""; + int err, i; + char *xspath; + size_t xspathsize; + const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ + + pr_debug("%s %s\n", __func__, dev->otherend); + + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming default"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -1; + } + err = xenbus_gather(XBT_NIL, dev->otherend, + "feature-persistent", "%u", + &pers_grants, NULL); + if (err) + pers_grants = 0; + + be->blkif->vbd.feature_gnt_persistent = pers_grants; + be->blkif->vbd.overflow_max_grants = 0; + + pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, + be->blkif->nr_rings, be->blkif->blk_protocol, protocol, + pers_grants ? "persistent grants" : ""); + + if (be->blkif->nr_rings == 1) + return read_per_ring_refs(&be->blkif->rings[0], dev->otherend); + else { + xspathsize = strlen(dev->otherend) + xenstore_path_ext_size; + xspath = kmalloc(xspathsize, GFP_KERNEL); + if (!xspath) { + xenbus_dev_fatal(dev, -ENOMEM, "reading ring references"); + return -ENOMEM; + } + + for (i = 0; i < be->blkif->nr_rings; i++) { + memset(xspath, 0, xspathsize); + snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i); + err = read_per_ring_refs(&be->blkif->rings[i], xspath); + if (err) { + kfree(xspath); + return err; + } + } + kfree(xspath); + } + return 0; } static const struct xenbus_device_id xen_blkbk_ids[] = { From d62d86000316d7ef38e1c2e9602c3ce6d1cb57bd Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:17 +0800 Subject: [PATCH 12/21] xen/blkback: get the number of hardware queues/rings from blkfront Backend advertises "multi-queue-max-queues" to front, also get the negotiated number from "multi-queue-num-queues" written by blkfront. Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 13 +++++++++++ drivers/block/xen-blkback/common.h | 1 + drivers/block/xen-blkback/xenbus.c | 34 ++++++++++++++++++++++++----- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4fd8640d146c..18b27770d80b 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); +/* + * Maximum number of rings/queues blkback supports, allow as many queues as there + * are CPUs if user has not specified a value. 
+ */ +unsigned int xenblk_max_queues; +module_param_named(max_queues, xenblk_max_queues, uint, 0644); +MODULE_PARM_DESC(max_queues, + "Maximum number of hardware queues per virtual disk. " \ + "By default it is the number of online CPUs."); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -1483,6 +1493,9 @@ static int __init xen_blkif_init(void) xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } + if (xenblk_max_queues == 0) + xenblk_max_queues = num_online_cpus(); + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 310eff3cf43f..847444dc1df4 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -46,6 +46,7 @@ #include extern unsigned int xen_blkif_max_ring_order; +extern unsigned int xenblk_max_queues; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f5bfedd0e948..0d6bb9383a68 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -181,12 +181,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) blkif->st_print = jiffies; INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - blkif->nr_rings = 1; - if (xen_blkif_alloc_rings(blkif)) { - kmem_cache_free(xen_blkif_cachep, blkif); - return ERR_PTR(-ENOMEM); - } - return blkif; } @@ -595,6 +589,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev, goto fail; } + /* Multi-queue: advertise how many queues are supported by us. */ + err = xenbus_printf(XBT_NIL, dev->nodename, + "multi-queue-max-queues", "%u", xenblk_max_queues); + if (err) + pr_warn("Error writing multi-queue-max-queues\n"); + /* setup back pointer */ be->blkif->be = be; @@ -980,6 +980,7 @@ static int connect_ring(struct backend_info *be) char *xspath; size_t xspathsize; const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ + unsigned int requested_num_queues = 0; pr_debug("%s %s\n", __func__, dev->otherend); @@ -1007,6 +1008,27 @@ static int connect_ring(struct backend_info *be) be->blkif->vbd.feature_gnt_persistent = pers_grants; be->blkif->vbd.overflow_max_grants = 0; + /* + * Read the number of hardware queues from the frontend. + */ + err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues", + "%u", &requested_num_queues); + if (err < 0) { + requested_num_queues = 1; + } else { + if (requested_num_queues > xenblk_max_queues + || requested_num_queues == 0) { + /* Buggy or malicious guest. */ + xenbus_dev_fatal(dev, err, + "guest requested %u queues, exceeding the maximum of %u.", + requested_num_queues, xenblk_max_queues); + return -ENOSYS; + } + } + be->blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(be->blkif)) + return -ENOMEM; + pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, be->blkif->nr_rings, be->blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); From d4bf0065b7251afb723a29b2fd58f7c38f8ce297 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Sat, 14 Nov 2015 11:12:19 +0800 Subject: [PATCH 13/21] xen/blkback: make pool of persistent grants and free pages per-queue Make the pool of persistent grants and free pages per-queue/ring instead of per-device, to get better scalability.
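Roughly sketched, the structural effect is the following (an abbreviated illustration only; the real declarations are in the common.h hunk further down):

	#include <linux/list.h>
	#include <linux/rbtree.h>
	#include <linux/spinlock.h>

	/* Sketch: each ring now owns its pools and the locks guarding them. */
	struct xen_blkif_ring {
		struct rb_root		persistent_gnts;	/* per-ring tree */
		unsigned int		persistent_gnt_c;
		spinlock_t		free_pages_lock;	/* per-ring pool */
		struct list_head	free_pages;
		/* ... */
	};

	struct xen_blkif {
		/* Only genuinely shared, per-device state stays here. */
		struct xen_blkif_ring	*rings;
		unsigned int		nr_rings;
		/* ... */
	};

With the pools per-ring, queues submitting I/O in parallel no longer serialize on a single device-wide lock or persistent-grant tree.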
Test was done based on the null_blk driver:
dom0: v4.2-rc8 16vcpus 10GB "modprobe null_blk"
domu: v4.2-rc8 16vcpus 10GB

[test]
rw=read
direct=1
ioengine=libaio
bs=4k
time_based
runtime=30
filename=/dev/xvdb
numjobs=16
iodepth=64
iodepth_batch=64
iodepth_batch_complete=64
group_reporting

Results:
iops1: After patch "xen/blkfront: make persistent grants per-queue".
iops2: After this patch.

Queues:         1     4           8            16
Iops orig(k):   810   1064        780          700
Iops1(k):       810   1230(~20%)  1024(~20%)   850(~20%)
Iops2(k):       810   1410(~35%)  1354(~75%)   1440(~100%)

With 4 queues after this commit we can get a ~75% increase in IOPS, and performance won't drop as the number of queues increases. Please find the respective chart at this link: https://www.dropbox.com/s/agrcy2pbzbsvmwv/iops.png?dl=0

Signed-off-by: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 202 +++++++++++++--------------- drivers/block/xen-blkback/common.h | 32 ++--- drivers/block/xen-blkback/xenbus.c | 21 ++- 3 files changed, 118 insertions(+), 137 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 18b27770d80b..a00d6c6c2880 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -123,60 +123,60 @@ module_param(log_stats, int, 0644); /* Number of free pages to remove on each call to gnttab_free_pages */ #define NUM_BATCH_FREE_PAGES 10 -static inline int get_free_page(struct xen_blkif *blkif, struct page **page) +static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) { unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - if (list_empty(&blkif->free_pages)) { - BUG_ON(blkif->free_pages_num != 0); - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); + if (list_empty(&ring->free_pages)) { + BUG_ON(ring->free_pages_num != 0); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return gnttab_alloc_pages(1, page); } - BUG_ON(blkif->free_pages_num == 0); - page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + BUG_ON(ring->free_pages_num == 0); + page[0] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[0]->lru); - blkif->free_pages_num--; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + ring->free_pages_num--; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return 0; } -static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, +static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page, int num) { unsigned long flags; int i; - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); for (i = 0; i < num; i++) - list_add(&page[i]->lru, &blkif->free_pages); - blkif->free_pages_num += num; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + list_add(&page[i]->lru, &ring->free_pages); + ring->free_pages_num += num; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); } -static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) +static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) { /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ struct page *page[NUM_BATCH_FREE_PAGES]; unsigned int num_pages = 0; unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - while (blkif->free_pages_num > num) { - BUG_ON(list_empty(&blkif->free_pages)); - page[num_pages] = list_first_entry(&blkif->free_pages, +
spin_lock_irqsave(&ring->free_pages_lock, flags); + while (ring->free_pages_num > num) { + BUG_ON(list_empty(&ring->free_pages)); + page[num_pages] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[num_pages]->lru); - blkif->free_pages_num--; + ring->free_pages_num--; if (++num_pages == NUM_BATCH_FREE_PAGES) { - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); gnttab_free_pages(num_pages, page); - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); num_pages = 0; } } - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); if (num_pages != 0) gnttab_free_pages(num_pages, page); } @@ -199,23 +199,29 @@ static void make_response(struct xen_blkif_ring *ring, u64 id, /* - * pers_gnts_lock must be used around all the persistent grant helpers - * because blkback may use multi-thread/queue for each backend. + * We don't need locking around the persistent grant helpers + * because blkback uses a single-thread for each backend, so we + * can be sure that this functions will never be called recursively. + * + * The only exception to that is put_persistent_grant, that can be called + * from interrupt context (by xen_blkbk_unmap), so we have to use atomic + * bit operations to modify the flags of a persistent grant and to count + * the number of used grants. */ -static int add_persistent_gnt(struct xen_blkif *blkif, +static int add_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + struct xen_blkif *blkif = ring->blkif; - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); - if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; return -EBUSY; } /* Figure out where to put new node */ - new = &blkif->persistent_gnts.rb_node; + new = &ring->persistent_gnts.rb_node; while (*new) { this = container_of(*new, struct persistent_gnt, node); @@ -234,20 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif, set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); /* Add new node and rebalance tree. 
*/ rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); - blkif->persistent_gnt_c++; - atomic_inc(&blkif->persistent_gnt_in_use); + rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts); + ring->persistent_gnt_c++; + atomic_inc(&ring->persistent_gnt_in_use); return 0; } -static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring, grant_ref_t gref) { struct persistent_gnt *data; struct rb_node *node = NULL; - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); - node = blkif->persistent_gnts.rb_node; + node = ring->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -261,25 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, return NULL; } set_bit(PERSISTENT_GNT_ACTIVE, data->flags); - atomic_inc(&blkif->persistent_gnt_in_use); + atomic_inc(&ring->persistent_gnt_in_use); return data; } } return NULL; } -static void put_persistent_gnt(struct xen_blkif *blkif, +static void put_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); - atomic_dec(&blkif->persistent_gnt_in_use); + atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, +static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -293,7 +297,6 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; - BUG_ON(!spin_is_locked(&blkif->pers_gnts_lock)); foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == BLKBACK_INVALID_HANDLE); @@ -311,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } @@ -328,17 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; int segs_to_unmap = 0; - struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; - unsigned long flags; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; - spin_lock_irqsave(&blkif->pers_gnts_lock, flags); - while(!list_empty(&blkif->persistent_purge_list)) { - persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + while(!list_empty(&ring->persistent_purge_list)) { + persistent_gnt = list_first_entry(&ring->persistent_purge_list, struct persistent_gnt, remove_node); list_del(&persistent_gnt->remove_node); @@ -353,45 +354,42 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { unmap_data.count = segs_to_unmap; 
BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } kfree(persistent_gnt); } - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); } } -static void purge_persistent_gnt(struct xen_blkif *blkif) +static void purge_persistent_gnt(struct xen_blkif_ring *ring) { struct persistent_gnt *persistent_gnt; struct rb_node *n; unsigned int num_clean, total; bool scan_used = false, clean_used = false; struct rb_root *root; - unsigned long flags; - spin_lock_irqsave(&blkif->pers_gnts_lock, flags); - if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || - (blkif->persistent_gnt_c == xen_blkif_max_pgrants && - !blkif->vbd.overflow_max_grants)) { + if (ring->persistent_gnt_c < xen_blkif_max_pgrants || + (ring->persistent_gnt_c == xen_blkif_max_pgrants && + !ring->blkif->vbd.overflow_max_grants)) { goto out; } - if (work_busy(&blkif->persistent_purge_work)) { + if (work_busy(&ring->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; - num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; - num_clean = min(blkif->persistent_gnt_c, num_clean); + num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(ring->persistent_gnt_c, num_clean); if ((num_clean == 0) || - (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) + (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use)))) goto out; /* @@ -407,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) pr_debug("Going to purge %u persistent grants\n", num_clean); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - root = &blkif->persistent_gnts; + BUG_ON(!list_empty(&ring->persistent_purge_list)); + root = &ring->persistent_gnts; purge_list: foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == @@ -427,7 +425,7 @@ purge_list: rb_erase(&persistent_gnt->node, root); list_add(&persistent_gnt->remove_node, - &blkif->persistent_purge_list); + &ring->persistent_purge_list); if (--num_clean == 0) goto finished; } @@ -448,18 +446,14 @@ finished: goto purge_list; } - blkif->persistent_gnt_c -= (total - num_clean); - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); - blkif->vbd.overflow_max_grants = 0; + ring->persistent_gnt_c -= (total - num_clean); + ring->blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ - schedule_work(&blkif->persistent_purge_work); + schedule_work(&ring->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), total); - return; out: - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); - return; } @@ -591,14 +585,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) * SCHEDULER FUNCTIONS */ -static void print_stats(struct xen_blkif *blkif) +static void print_stats(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" " | ds %4llu | pg: %4u/%4d\n", current->comm, blkif->st_oo_req, blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req, blkif->st_ds_req, - blkif->persistent_gnt_c, + ring->persistent_gnt_c, xen_blkif_max_pgrants); 
blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; @@ -651,23 +647,23 @@ int xen_blkif_schedule(void *arg) purge_gnt_list: if (blkif->vbd.feature_gnt_persistent && - time_after(jiffies, blkif->next_lru)) { - purge_persistent_gnt(blkif); - blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + time_after(jiffies, ring->next_lru)) { + purge_persistent_gnt(ring); + ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); } /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); + shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); - if (log_stats && time_after(jiffies, blkif->st_print)) - print_stats(blkif); + if (log_stats && time_after(jiffies, ring->blkif->st_print)) + print_stats(ring); } /* Drain pending purge work */ - flush_work(&blkif->persistent_purge_work); + flush_work(&ring->persistent_purge_work); if (log_stats) - print_stats(blkif); + print_stats(ring); ring->xenblkd = NULL; xen_blkif_put(blkif); @@ -680,21 +676,16 @@ purge_gnt_list: */ void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { - struct xen_blkif *blkif = ring->blkif; - unsigned long flags; - /* Free all persistent grant pages */ - spin_lock_irqsave(&blkif->pers_gnts_lock, flags); - if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(blkif, &blkif->persistent_gnts, - blkif->persistent_gnt_c); + if (!RB_EMPTY_ROOT(&ring->persistent_gnts)) + free_persistent_gnts(ring, &ring->persistent_gnts, + ring->persistent_gnt_c); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - blkif->persistent_gnt_c = 0; - spin_unlock_irqrestore(&blkif->pers_gnts_lock, flags); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + ring->persistent_gnt_c = 0; /* Since we are shutting down remove all pages from the buffer */ - shrink_free_pagepool(blkif, 0 /* All */); + shrink_free_pagepool(ring, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( @@ -705,13 +696,10 @@ static unsigned int xen_blkbk_unmap_prepare( struct page **unmap_pages) { unsigned int i, invcount = 0; - unsigned long flags; for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - spin_lock_irqsave(&ring->blkif->pers_gnts_lock, flags); - put_persistent_gnt(ring->blkif, pages[i]->persistent_gnt); - spin_unlock_irqrestore(&ring->blkif->pers_gnts_lock, flags); + put_persistent_gnt(ring, pages[i]->persistent_gnt); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -736,7 +724,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ but is this the best way to deal with this? 
*/ BUG_ON(result); - put_free_pages(blkif, data->pages, data->count); + put_free_pages(ring, data->pages, data->count); make_response(ring, pending_req->id, pending_req->operation, pending_req->status); free_req(ring, pending_req); @@ -803,7 +791,7 @@ static void xen_blkbk_unmap(struct xen_blkif_ring *ring, if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(ring->blkif, unmap_pages, invcount); + put_free_pages(ring, unmap_pages, invcount); } pages += batch; num -= batch; @@ -824,7 +812,6 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, int last_map = 0, map_until = 0; int use_persistent_gnts; struct xen_blkif *blkif = ring->blkif; - unsigned long irq_flags; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -838,11 +825,9 @@ again: uint32_t flags; if (use_persistent_gnts) { - spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); persistent_gnt = get_persistent_gnt( - blkif, + ring, pages[i]->gref); - spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); } if (persistent_gnt) { @@ -853,7 +838,7 @@ again: pages[i]->page = persistent_gnt->page; pages[i]->persistent_gnt = persistent_gnt; } else { - if (get_free_page(blkif, &pages[i]->page)) + if (get_free_page(ring, &pages[i]->page)) goto out_of_memory; addr = vaddr(pages[i]->page); pages_to_gnt[segs_to_map] = pages[i]->page; @@ -886,7 +871,7 @@ again: BUG_ON(new_map_idx >= segs_to_map); if (unlikely(map[new_map_idx].status != 0)) { pr_debug("invalid buffer -- could not remap it\n"); - put_free_pages(blkif, &pages[seg_idx]->page, 1); + put_free_pages(ring, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; goto next; @@ -896,7 +881,7 @@ again: continue; } if (use_persistent_gnts && - blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + ring->persistent_gnt_c < xen_blkif_max_pgrants) { /* * We are using persistent grants, the grant is * not mapped but we might have room for it. @@ -914,19 +899,16 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; - spin_lock_irqsave(&blkif->pers_gnts_lock, irq_flags); - if (add_persistent_gnt(blkif, + if (add_persistent_gnt(ring, persistent_gnt)) { - spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); kfree(persistent_gnt); persistent_gnt = NULL; goto next; } pages[seg_idx]->persistent_gnt = persistent_gnt; pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, + persistent_gnt->gnt, ring->persistent_gnt_c, xen_blkif_max_pgrants); - spin_unlock_irqrestore(&blkif->pers_gnts_lock, irq_flags); goto next; } if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { @@ -950,7 +932,7 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); - put_free_pages(blkif, pages_to_gnt, segs_to_map); + put_free_pages(ring, pages_to_gnt, segs_to_map); return -ENOMEM; } diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 847444dc1df4..3c244ecf22a4 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -291,6 +291,22 @@ struct xen_blkif_ring { spinlock_t pending_free_lock; wait_queue_head_t pending_free_wq; + /* Tree to store persistent grants. 
*/ + spinlock_t pers_gnts_lock; + struct rb_root persistent_gnts; + unsigned int persistent_gnt_c; + atomic_t persistent_gnt_in_use; + unsigned long next_lru; + + /* Used by the kworker that offloads work from the persistent purge. */ + struct list_head persistent_purge_list; + struct work_struct persistent_purge_work; + + /* Buffer of free pages to map grant refs. */ + spinlock_t free_pages_lock; + int free_pages_num; + struct list_head free_pages; + struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; @@ -312,22 +328,6 @@ struct xen_blkif { struct completion drain_complete; atomic_t drain; - /* tree to store persistent grants */ - spinlock_t pers_gnts_lock; - struct rb_root persistent_gnts; - unsigned int persistent_gnt_c; - atomic_t persistent_gnt_in_use; - unsigned long next_lru; - - /* used by the kworker that offload work from the persistent purge */ - struct list_head persistent_purge_list; - struct work_struct persistent_purge_work; - - /* buffer of free pages to map grant refs */ - spinlock_t free_pages_lock; - int free_pages_num; - struct list_head free_pages; - /* statistics */ unsigned long st_print; unsigned long long st_rd_req; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 0d6bb9383a68..2b8650a9a6a9 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -150,6 +150,10 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif) spin_lock_init(&ring->blk_ring_lock); init_waitqueue_head(&ring->wq); INIT_LIST_HEAD(&ring->pending_free); + INIT_LIST_HEAD(&ring->persistent_purge_list); + INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants); + spin_lock_init(&ring->free_pages_lock); + INIT_LIST_HEAD(&ring->free_pages); spin_lock_init(&ring->pending_free_lock); init_waitqueue_head(&ring->pending_free_wq); @@ -175,11 +179,7 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) atomic_set(&blkif->refcnt, 1); init_completion(&blkif->drain_complete); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - spin_lock_init(&blkif->free_pages_lock); - INIT_LIST_HEAD(&blkif->free_pages); - INIT_LIST_HEAD(&blkif->persistent_purge_list); blkif->st_print = jiffies; - INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); return blkif; } @@ -290,6 +290,12 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) i++; } + BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0); + BUG_ON(!list_empty(&ring->persistent_purge_list)); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + BUG_ON(!list_empty(&ring->free_pages)); + BUG_ON(ring->free_pages_num != 0); + BUG_ON(ring->persistent_gnt_c != 0); WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); } blkif->nr_ring_pages = 0; @@ -304,13 +310,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) xen_vbd_free(&blkif->vbd); /* Make sure everything is drained before shutting down */ - BUG_ON(blkif->persistent_gnt_c != 0); - BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); - BUG_ON(blkif->free_pages_num != 0); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - BUG_ON(!list_empty(&blkif->free_pages)); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - kfree(blkif->rings); kmem_cache_free(xen_blkif_cachep, blkif); } From bde21f73b9be146fda0c689f2724cda9d7737565 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 25 Nov 2015 13:07:39 -0500 Subject: [PATCH 14/21] xen/blocks: Return -EXX instead of -1 Let's return sensible values instead of -1.
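For example, when blk_mq_init_queue() fails it hands back an ERR_PTR, so the frontend can propagate the encoded errno rather than a bare -1. The relevant lines of the xlvbd_init_blk_queue() hunk below become:

	rq = blk_mq_init_queue(&info->tag_set);
	if (IS_ERR(rq)) {
		blk_mq_free_tag_set(&info->tag_set);
		/* Propagate the real reason (e.g. -ENOMEM), not an opaque -1. */
		return PTR_ERR(rq);
	}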
Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/xen-blkfront.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 2b8650a9a6a9..ca3a414de11c 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -996,7 +996,7 @@ static int connect_ring(struct backend_info *be) be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; else { xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); - return -1; + return -ENOSYS; } err = xenbus_gather(XBT_NIL, dev->otherend, "feature-persistent", "%u", diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8b0f3d92d8b4..ef5ce43e307a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -828,11 +828,11 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, info->tag_set.driver_data = info; if (blk_mq_alloc_tag_set(&info->tag_set)) - return -1; + return -EINVAL; rq = blk_mq_init_queue(&info->tag_set); if (IS_ERR(rq)) { blk_mq_free_tag_set(&info->tag_set); - return -1; + return PTR_ERR(rq); } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); From 2d0382fac17cef20d507a0211b82e0942b2ab271 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 25 Nov 2015 13:20:14 -0500 Subject: [PATCH 15/21] xen/blkback: Free resources if connect_ring failed.

With the multi-queue support we could fail at setting up some of the rings and fail the connection. That meant that all resources tied to rings[0..n-1] (where n is the ring that failed to be set up) were left allocated. Eventually the frontend will switch states and we will call xen_blkif_disconnect. However we do not want to be at the mercy of the frontend deciding when to change states. This allows us to do the cleanup right away and free the resources.

Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index ca3a414de11c..c92b35882720 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -749,8 +749,14 @@ static void frontend_changed(struct xenbus_device *dev, } err = connect_ring(be); - if (err) + if (err) { + /* + * Clean up so that memory resources can be used by + * other devices. connect_ring already reported the error. + */ + xen_blkif_disconnect(be->blkif); break; + } xen_update_blkif_status(be->blkif); break; From a6e7af1288eeb7fca8361356998d31a92a291531 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 26 Oct 2015 14:47:21 +0900 Subject: [PATCH 16/21] xen-blkback: clear PF_NOFREEZE for xen_blkif_schedule()

The xen_blkif_schedule() kthread calls try_to_freeze() at the beginning of every attempt to purge the LRU. This operation can't ever succeed though, as the kthread hasn't marked itself as freezable. Before (hopefully eventually) kthread freezing gets converted to filesystem freezing, we'd rather mark xen_blkif_schedule() freezable (as it can generate I/O during suspend).
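The change enables the standard freezable-kthread pattern; a minimal, self-contained sketch (the function name is illustrative, not the driver's):

	#include <linux/freezer.h>
	#include <linux/kthread.h>

	static int example_ring_thread(void *arg)
	{
		set_freezable();	/* clear PF_NOFREEZE, opt in to freezing */

		while (!kthread_should_stop()) {
			if (try_to_freeze())	/* parks here across suspend */
				continue;

			/* ... wait for and service ring requests ... */
		}
		return 0;
	}

Without the set_freezable() call, the try_to_freeze() below it can never succeed.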
Signed-off-by: Jiri Kosina Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index a00d6c6c2880..99b479f330af 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -613,6 +613,7 @@ int xen_blkif_schedule(void *arg) xen_blkif_get(blkif); + set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) continue; From 2e073969d57f60fc0b863985779657624cbd4886 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Thu, 13 Aug 2015 19:23:10 +0100 Subject: [PATCH 17/21] xen-blkfront: Introduce blkif_ring_get_request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

The code to get a request is always the same. Therefore we can factor it out into a single function.

Signed-off-by: Julien Grall Acked-by: Roger Pau Monné Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ef5ce43e307a..0b32c90ffc3f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -481,6 +481,23 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } +static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, + struct request *req, + struct blkif_request **ring_req) +{ + unsigned long id; + + *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + rinfo->ring.req_prod_pvt++; + + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; + + (*ring_req)->u.rw.id = id; + + return id; +} + static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) { struct blkfront_info *info = rinfo->dev_info; @@ -488,9 +505,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf unsigned long id; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); - id = get_id_from_freelist(rinfo); - rinfo->shadow[id].request = req; + id = blkif_ring_get_request(rinfo, req, &ring_req); ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -501,8 +516,6 @@ else ring_req->u.discard.flag = 0; - rinfo->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ rinfo->shadow[id].req = *ring_req; @@ -635,9 +648,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri } /* Fill out a communications ring structure.
*/ - ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); - id = get_id_from_freelist(rinfo); - rinfo->shadow[id].request = req; + id = blkif_ring_get_request(rinfo, req, &ring_req); BUG_ON(info->max_indirect_segments == 0 && GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); @@ -650,7 +661,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) num_grant += gnttab_count_grant(sg->offset, sg->length); - ring_req->u.rw.id = id; rinfo->shadow[id].num_sg = num_sg; if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { /* @@ -716,8 +726,6 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri if (setup.segments) kunmap_atomic(setup.segments); - rinfo->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ rinfo->shadow[id].req = *ring_req; From 6cc5683390472c450fd69975d1283db79202667f Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Thu, 13 Aug 2015 13:13:35 +0100 Subject: [PATCH 18/21] xen/blkfront: Handle non-indirect grant with 64KB pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

The minimal size of a request in the block framework is always PAGE_SIZE. This means that when a guest uses 64KB pages, a request will be at least 64KB. However, if the backend doesn't support indirect descriptors (such as QDISK in QEMU), a ring request can only accommodate 11 segments of 4KB (i.e. 44KB).

The current frontend assumes that an I/O request will always fit in a ring request. This is no longer true when using 64KB page granularity, and the guest will therefore crash during boot.

On ARM64, the ABI is completely neutral to the page granularity used by the domU. The guest has the choice between the different page granularities supported by the processor (for instance on ARM64: 4KB, 16KB, 64KB). This can't be enforced by the hypervisor, and it's therefore possible to run guests using different page granularities. So we can't mandate that the block backend support indirect descriptors when the frontend is using 64KB page granularity, and we have to fix it properly in the frontend.

The solution exposed below is based on modifying the frontend directly rather than asking the block framework to support smaller sizes (i.e. < PAGE_SIZE). This is because the changes in the block framework are not trivial, as everything seems to rely on a struct page (see [1]). It may still be possible that someone succeeds in doing it in the future, in which case we would be able to use it.

Given that a block request may not fit in a single ring request, a second ring request is introduced for the data that cannot fit in the first one. This means that the second ring request should never be used on Linux if the page size is smaller than 44KB.

To achieve the support of the extra ring request, the block queue size is divided by two. Therefore, the ring will always contain enough space to accommodate 2 ring requests. While this will reduce the overall performance, it will make the implementation more contained. The way forward to get better performance is to implement in the backend either indirect descriptors or a multi-grant ring.

Note that the blk_queue_max_* parameter helpers haven't been updated. The block code will set the minimum size supported, and we may be able to directly support any change in the block framework that lowers the minimal size of a request.
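The arithmetic can be made concrete with a worked sketch (HAS_EXTRA_REQ is the macro the patch adds in the diff below; the numbers assume 4KB Xen grants):

	/*
	 * A non-indirect ring request carries at most
	 * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11 grants of XEN_PAGE_SIZE = 4KB,
	 * i.e. 44KB. A 64KB Linux page spans 64KB / 4KB = 16 grants
	 * (XEN_PFN_PER_PAGE), which cannot be described by one ring request,
	 * so a second request is needed:
	 *
	 *   64KB pages: HAS_EXTRA_REQ = (11 < 16) = true
	 *    4KB pages: HAS_EXTRA_REQ = (11 <  1) = false (the path compiles away)
	 */
	#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)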
[1] http://lists.xen.org/archives/html/xen-devel/2015-08/msg02200.html Signed-off-by: Julien Grall Acked-by: Roger Pau MonnĂ© Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 228 ++++++++++++++++++++++++++++++++--- 1 file changed, 212 insertions(+), 16 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 0b32c90ffc3f..f3d0d4758641 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -60,6 +60,20 @@ #include +/* + * The minimal size of segment supported by the block framework is PAGE_SIZE. + * When Linux is using a different page size than Xen, it may not be possible + * to put all the data in a single segment. + * This can happen when the backend doesn't support indirect descriptor and + * therefore the maximum amount of data that a request can carry is + * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB + * + * Note that we only support one extra request. So the Linux page size + * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) = + * 88KB. + */ +#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE) + enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, @@ -72,6 +86,13 @@ struct grant { struct list_head node; }; +enum blk_req_status { + REQ_WAITING, + REQ_DONE, + REQ_ERROR, + REQ_EOPNOTSUPP, +}; + struct blk_shadow { struct blkif_request req; struct request *request; @@ -79,6 +100,14 @@ struct blk_shadow { struct grant **indirect_grants; struct scatterlist *sg; unsigned int num_sg; + enum blk_req_status status; + + #define NO_ASSOCIATED_ID ~0UL + /* + * Id of the sibling if we ever need 2 requests when handling a + * block I/O request + */ + unsigned long associated_id; }; struct split_bio { @@ -492,6 +521,8 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, id = get_id_from_freelist(rinfo); rinfo->shadow[id].request = req; + rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; (*ring_req)->u.rw.id = id; @@ -533,6 +564,9 @@ struct setup_rw_req { bool need_copy; unsigned int bvec_off; char *bvec_data; + + bool require_extra_req; + struct blkif_request *extra_ring_req; }; static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, @@ -546,8 +580,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; struct blkfront_ring_info *rinfo = setup->rinfo; + /* + * We always use the shadow of the first request to store the list + * of grant associated to the block I/O request. This made the + * completion more easy to handle even if the block I/O request is + * split. + */ struct blk_shadow *shadow = &rinfo->shadow[setup->id]; + if (unlikely(setup->require_extra_req && + grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + /* + * We are using the second request, setup grant_idx + * to be the index of the segment array. + */ + grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST; + ring_req = setup->extra_ring_req; + } + if ((ring_req->operation == BLKIF_OP_INDIRECT) && (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { if (setup->segments) @@ -562,7 +612,11 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo); ref = gnt_list_entry->gref; - shadow->grants_used[grant_idx] = gnt_list_entry; + /* + * All the grants are stored in the shadow of the first + * request. 
+	 * request. Therefore we have to use the global index.
+	 */
+	shadow->grants_used[setup->grant_idx] = gnt_list_entry;
 
 	if (setup->need_copy) {
 		void *shared_data;
@@ -604,11 +658,31 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 	(setup->grant_idx)++;
 }
 
+static void blkif_setup_extra_req(struct blkif_request *first,
+				  struct blkif_request *second)
+{
+	uint16_t nr_segments = first->u.rw.nr_segments;
+
+	/*
+	 * The second request is only present when the first request uses
+	 * all its segments. It is always a continuation of the first one.
+	 */
+	first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	second->u.rw.sector_number = first->u.rw.sector_number +
+		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+	second->u.rw.handle = first->u.rw.handle;
+	second->operation = first->operation;
+}
+
 static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
 {
 	struct blkfront_info *info = rinfo->dev_info;
-	struct blkif_request *ring_req;
-	unsigned long id;
+	struct blkif_request *ring_req, *extra_ring_req = NULL;
+	unsigned long id, extra_id = NO_ASSOCIATED_ID;
+	bool require_extra_req = false;
 	int i;
 	struct setup_rw_req setup = {
 		.grant_idx = 0,
@@ -650,19 +724,19 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	/* Fill out a communications ring structure. */
 	id = blkif_ring_get_request(rinfo, req, &ring_req);
 
-	BUG_ON(info->max_indirect_segments == 0 &&
-	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	BUG_ON(info->max_indirect_segments &&
-	       GREFS(req->nr_phys_segments) > info->max_indirect_segments);
-
 	num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
 	num_grant = 0;
 	/* Calculate the number of grant used */
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
 		num_grant += gnttab_count_grant(sg->offset, sg->length);
 
+	require_extra_req = info->max_indirect_segments == 0 &&
+		num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
 	rinfo->shadow[id].num_sg = num_sg;
-	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+	    likely(!require_extra_req)) {
 		/*
 		 * The indirect operation can only be a BLKIF_OP_READ or
 		 * BLKIF_OP_WRITE
@@ -702,10 +776,30 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 		}
 	}
 	ring_req->u.rw.nr_segments = num_grant;
+	if (unlikely(require_extra_req)) {
+		extra_id = blkif_ring_get_request(rinfo, req,
+						  &extra_ring_req);
+		/*
+		 * Only the first request contains the scatter-gather
+		 * list.
+		 */
+		rinfo->shadow[extra_id].num_sg = 0;
+
+		blkif_setup_extra_req(ring_req, extra_ring_req);
+
+		/* Link the 2 requests together */
+		rinfo->shadow[extra_id].associated_id = id;
+		rinfo->shadow[id].associated_id = extra_id;
+	}
 
 	setup.ring_req = ring_req;
 	setup.id = id;
+
+	setup.require_extra_req = require_extra_req;
+	if (unlikely(require_extra_req))
+		setup.extra_ring_req = extra_ring_req;
+
 	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
 		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
@@ -728,6 +822,8 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
 	/* Keep a private copy so we can reissue requests when recovering. */
 	rinfo->shadow[id].req = *ring_req;
+	if (unlikely(require_extra_req))
+		rinfo->shadow[extra_id].req = *extra_ring_req;
 
 	if (max_grefs > 0)
 		gnttab_free_grant_references(setup.gref_head);
@@ -829,7 +925,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 	memset(&info->tag_set, 0, sizeof(info->tag_set));
 	info->tag_set.ops = &blkfront_mq_ops;
 	info->tag_set.nr_hw_queues = info->nr_rings;
-	info->tag_set.queue_depth = BLK_RING_SIZE(info);
+	if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+		/*
+		 * When indirect descriptors are not supported, the I/O
+		 * request will be split between multiple requests in the
+		 * ring. To avoid problems when sending the request, divide
+		 * the depth of the queue by 2.
+		 */
+		info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
+	} else
+		info->tag_set.queue_depth = BLK_RING_SIZE(info);
 	info->tag_set.numa_node = NUMA_NO_NODE;
 	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	info->tag_set.cmd_size = 0;
@@ -1269,20 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
 	kunmap_atomic(shared_data);
 }
 
-static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *rinfo,
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+	switch (rsp) {
+	case BLKIF_RSP_OKAY:
+		return REQ_DONE;
+	case BLKIF_RSP_EOPNOTSUPP:
+		return REQ_EOPNOTSUPP;
+	case BLKIF_RSP_ERROR:
+		/* Fallthrough. */
+	default:
+		return REQ_ERROR;
+	}
+}
+
+/*
+ * Get the final status of the block request based on the two ring responses.
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+				  enum blk_req_status s2)
+{
+	BUG_ON(s1 == REQ_WAITING);
+	BUG_ON(s2 == REQ_WAITING);
+
+	if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+		return BLKIF_RSP_ERROR;
+	else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+		return BLKIF_RSP_EOPNOTSUPP;
+	return BLKIF_RSP_OKAY;
+}
+
+static bool blkif_completion(unsigned long *id,
+			     struct blkfront_ring_info *rinfo,
+			     struct blkif_response *bret)
 {
 	int i = 0;
 	struct scatterlist *sg;
 	int num_sg, num_grant;
 	struct blkfront_info *info = rinfo->dev_info;
+	struct blk_shadow *s = &rinfo->shadow[*id];
 	struct copy_from_grant data = {
-		.s = s,
 		.grant_idx = 0,
 	};
 
 	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
 		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+	/* The I/O request may be split in two. */
+	if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+		struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+		/* Keep the status of the current response in shadow. */
+		s->status = blkif_rsp_to_req_status(bret->status);
+
+		/* Wait for the second response if it is not here yet. */
+		if (s2->status == REQ_WAITING)
+			return 0;
+
+		bret->status = blkif_get_final_status(s->status,
+						      s2->status);
+
+		/*
+		 * All the grants are stored in the first shadow in order
+		 * to make the completion code simpler.
+		 */
+		num_grant += s2->req.u.rw.nr_segments;
+
+		/*
+		 * The two responses may not come in order. Only the
+		 * first request will store the scatter-gather list.
+		 */
+		if (s2->num_sg != 0) {
+			/* Update "id" with the ID of the first response. */
+			*id = s->associated_id;
+			s = s2;
+		}
+
+		/*
+		 * We no longer need the second request, so recycle it now.
+		 */
+		if (add_id_to_freelist(rinfo, s->associated_id))
+			WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+			     info->gd->disk_name, s->associated_id);
+	}
+
+	data.s = s;
 	num_sg = s->num_sg;
 
 	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1352,6 +1530,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
 			}
 		}
 	}
+
+	return 1;
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1391,8 +1571,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 		}
 		req  = rinfo->shadow[id].request;
 
-		if (bret->operation != BLKIF_OP_DISCARD)
-			blkif_completion(&rinfo->shadow[id], rinfo, bret);
+		if (bret->operation != BLKIF_OP_DISCARD) {
+			/*
+			 * We may need to wait for an extra response if the
+			 * I/O request is split in 2.
+			 */
+			if (!blkif_completion(&id, rinfo, bret))
+				continue;
+		}
 
 		if (add_id_to_freelist(rinfo, id)) {
 			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@@ -2017,8 +2203,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
 	int err, i;
 	struct blkfront_info *info = rinfo->dev_info;
 
-	if (info->max_indirect_segments == 0)
-		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	if (info->max_indirect_segments == 0) {
+		if (!HAS_EXTRA_REQ)
+			grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		else {
+			/*
+			 * When an extra req is required, the maximum
+			 * number of grants supported is related to the
+			 * size of the Linux block segment.
+			 */
+			grants = GRANTS_PER_PSEG;
+		}
+	} else
 		grants = info->max_indirect_segments;
 	psegs = grants / GRANTS_PER_PSEG;

From db6fbc106786f26d95889c50c18b1f28aa543a17 Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Wed, 9 Dec 2015 07:44:02 +0800
Subject: [PATCH 19/21] xen/blkback: make st_ statistics per ring

Make the st_* statistics per ring, and have the VBD sysfs nodes iterate
over all the rings.

Note: xenvbd_sysfs_delif() is called in xen_blkbk_remove() before all
rings are torn down, so it's safe.

Signed-off-by: Bob Liu
Signed-off-by: Konrad Rzeszutek Wilk
---
v2: Aligned the variables on the same column.
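To make the sysfs change concrete, here is an editor's stand-alone model
(hypothetical types, not the driver code) of what each show function
generated by the VBD_SHOW_ALLRING() macro below computes: the device-wide
value reported in sysfs is one st_* counter summed over all rings.

    #include <stdio.h>

    /* Hypothetical stand-in for the per-ring statistics added here. */
    struct ring_stats {
        unsigned long long st_rd_req;
    };

    /* Sum one counter across all rings, as the generated show_rd_req() does. */
    static unsigned long long total_rd_req(const struct ring_stats *rings,
                                           unsigned int nr_rings)
    {
        unsigned long long result = 0;
        unsigned int i;

        if (!rings) /* the macro bails out the same way when no rings exist */
            return 0;
        for (i = 0; i < nr_rings; i++)
            result += rings[i].st_rd_req;
        return result;
    }

    int main(void)
    {
        /* A vbd with 3 rings: sysfs reports the device-wide total (49). */
        struct ring_stats rings[] = { { 10 }, { 32 }, { 7 } };

        printf("rd_req %llu\n", total_rd_req(rings, 3));
        return 0;
    }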
---
 drivers/block/xen-blkback/blkback.c | 34 ++++++++++------------
 drivers/block/xen-blkback/common.h  | 20 ++++++-------
 drivers/block/xen-blkback/xenbus.c  | 45 ++++++++++++++++++++++-------
 3 files changed, 61 insertions(+), 38 deletions(-)

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 99b479f330af..148930c8c121 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -587,20 +587,18 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
 
 static void print_stats(struct xen_blkif_ring *ring)
 {
-	struct xen_blkif *blkif = ring->blkif;
-
 	pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
 		" | ds %4llu | pg: %4u/%4d\n",
-		current->comm, blkif->st_oo_req,
-		blkif->st_rd_req, blkif->st_wr_req,
-		blkif->st_f_req, blkif->st_ds_req,
+		current->comm, ring->st_oo_req,
+		ring->st_rd_req, ring->st_wr_req,
+		ring->st_f_req, ring->st_ds_req,
 		ring->persistent_gnt_c, xen_blkif_max_pgrants);
-	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
-	blkif->st_rd_req = 0;
-	blkif->st_wr_req = 0;
-	blkif->st_oo_req = 0;
-	blkif->st_ds_req = 0;
+	ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+	ring->st_rd_req = 0;
+	ring->st_wr_req = 0;
+	ring->st_oo_req = 0;
+	ring->st_ds_req = 0;
 }
 
 int xen_blkif_schedule(void *arg)
@@ -656,7 +654,7 @@ purge_gnt_list:
 		/* Shrink if we have more than xen_blkif_max_buffer_pages */
 		shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
 
-		if (log_stats && time_after(jiffies, ring->blkif->st_print))
+		if (log_stats && time_after(jiffies, ring->st_print))
 			print_stats(ring);
 	}
 
@@ -1018,7 +1016,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring,
 			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
 		goto fail_response;
 	}
-	blkif->st_ds_req++;
+	ring->st_ds_req++;
 
 	secure = (blkif->vbd.discard_secure &&
 		  (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@@ -1145,7 +1143,7 @@ __do_block_io_op(struct xen_blkif_ring *ring)
 
 		pending_req = alloc_req(ring);
 		if (NULL == pending_req) {
-			ring->blkif->st_oo_req++;
+			ring->st_oo_req++;
 			more_to_do = 1;
 			break;
 		}
@@ -1243,17 +1241,17 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 
 	switch (req_operation) {
 	case BLKIF_OP_READ:
-		ring->blkif->st_rd_req++;
+		ring->st_rd_req++;
 		operation = READ;
 		break;
 	case BLKIF_OP_WRITE:
-		ring->blkif->st_wr_req++;
+		ring->st_wr_req++;
 		operation = WRITE_ODIRECT;
 		break;
 	case BLKIF_OP_WRITE_BARRIER:
 		drain = true;
 	case BLKIF_OP_FLUSH_DISKCACHE:
-		ring->blkif->st_f_req++;
+		ring->st_f_req++;
 		operation = WRITE_FLUSH;
 		break;
 	default:
@@ -1395,9 +1393,9 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 	blk_finish_plug(&plug);
 
 	if (operation == READ)
-		ring->blkif->st_rd_sect += preq.nr_sects;
+		ring->st_rd_sect += preq.nr_sects;
 	else if (operation & WRITE)
-		ring->blkif->st_wr_sect += preq.nr_sects;
+		ring->st_wr_sect += preq.nr_sects;
 
 	return 0;
 
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 3c244ecf22a4..b27c5ba15600 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -298,6 +298,16 @@ struct xen_blkif_ring {
 	atomic_t		persistent_gnt_in_use;
 	unsigned long		next_lru;
 
+	/* Statistics. */
+	unsigned long		st_print;
+	unsigned long long	st_rd_req;
+	unsigned long long	st_wr_req;
+	unsigned long long	st_oo_req;
+	unsigned long long	st_f_req;
+	unsigned long long	st_ds_req;
+	unsigned long long	st_rd_sect;
+	unsigned long long	st_wr_sect;
+
 	/* Used by the kworker that offload work from the persistent purge. */
 	struct list_head	persistent_purge_list;
 	struct work_struct	persistent_purge_work;
@@ -328,16 +338,6 @@ struct xen_blkif {
 	struct completion	drain_complete;
 	atomic_t		drain;
 
-	/* statistics */
-	unsigned long		st_print;
-	unsigned long long	st_rd_req;
-	unsigned long long	st_wr_req;
-	unsigned long long	st_oo_req;
-	unsigned long long	st_f_req;
-	unsigned long long	st_ds_req;
-	unsigned long long	st_rd_sect;
-	unsigned long long	st_wr_sect;
-
 	struct work_struct	free_work;
 	unsigned int		nr_ring_pages;
 	/* All rings for this device. */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index c92b35882720..44396b8a0cb2 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -159,6 +159,7 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
 		init_waitqueue_head(&ring->pending_free_wq);
 		init_waitqueue_head(&ring->shutdown_wq);
 		ring->blkif = blkif;
+		ring->st_print = jiffies;
 		xen_blkif_get(blkif);
 	}
 
@@ -179,7 +180,6 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	atomic_set(&blkif->refcnt, 1);
 	init_completion(&blkif->drain_complete);
 	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-	blkif->st_print = jiffies;
 
 	return blkif;
 }
@@ -329,25 +329,38 @@ int __init xen_blkif_interface_init(void)
  *	sysfs interface for VBD I/O requests
  */
 
-#define VBD_SHOW(name, format, args...)					\
+#define VBD_SHOW_ALLRING(name, format)					\
 	static ssize_t show_##name(struct device *_dev,			\
 				   struct device_attribute *attr,	\
 				   char *buf)				\
 	{								\
 		struct xenbus_device *dev = to_xenbus_device(_dev);	\
 		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+		struct xen_blkif *blkif = be->blkif;			\
+		unsigned int i;						\
+		unsigned long long result = 0;				\
 									\
-		return sprintf(buf, format, ##args);			\
+		if (!blkif->rings)					\
+			goto out;					\
+									\
+		for (i = 0; i < blkif->nr_rings; i++) {			\
+			struct xen_blkif_ring *ring = &blkif->rings[i];	\
+									\
+			result += ring->st_##name;			\
+		}							\
+									\
+out:									\
+		return sprintf(buf, format, result);			\
	}								\
 	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
 
-VBD_SHOW(oo_req,  "%llu\n", be->blkif->st_oo_req);
-VBD_SHOW(rd_req,  "%llu\n", be->blkif->st_rd_req);
-VBD_SHOW(wr_req,  "%llu\n", be->blkif->st_wr_req);
-VBD_SHOW(f_req,   "%llu\n", be->blkif->st_f_req);
-VBD_SHOW(ds_req,  "%llu\n", be->blkif->st_ds_req);
-VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect);
-VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect);
+VBD_SHOW_ALLRING(oo_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_req,  "%llu\n");
+VBD_SHOW_ALLRING(wr_req,  "%llu\n");
+VBD_SHOW_ALLRING(f_req,   "%llu\n");
+VBD_SHOW_ALLRING(ds_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_sect, "%llu\n");
+VBD_SHOW_ALLRING(wr_sect, "%llu\n");
 
 static struct attribute *xen_vbdstat_attrs[] = {
 	&dev_attr_oo_req.attr,
@@ -365,6 +378,18 @@ static struct attribute_group xen_vbdstat_group = {
 	.attrs = xen_vbdstat_attrs,
 };
 
+#define VBD_SHOW(name, format, args...)					\
+	static ssize_t show_##name(struct device *_dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		struct xenbus_device *dev = to_xenbus_device(_dev);	\
+		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+									\
+		return sprintf(buf, format, ##args);			\
+	}								\
+	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
 VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
 VBD_SHOW(mode, "%s\n", be->mode);

From 93bb277f97a6d319361766bde228717faf4abdeb Mon Sep 17 00:00:00 2001
From: Bob Liu
Date: Thu, 10 Dec 2015 09:16:48 +0800
Subject: [PATCH 20/21] xen/blkback: Fix two memory leaks.
This patch fixes two memory leaks:

backtrace:
  [] kmemleak_alloc+0x28/0x50
  [] kmem_cache_alloc+0xbb/0x1d0
  [] xen_blkbk_probe+0x58/0x230
  [] xenbus_dev_probe+0x76/0x130
  [] driver_probe_device+0x166/0x2c0
  [] __device_attach_driver+0xac/0xb0
  [] bus_for_each_drv+0x67/0x90
  [] __device_attach+0xc7/0x120
  [] device_initial_probe+0x13/0x20
  [] bus_probe_device+0x9a/0xb0
  [] device_add+0x3b1/0x5c0
  [] device_register+0x1e/0x30
  [] xenbus_probe_node+0x158/0x170
  [] xenbus_dev_changed+0x1af/0x1c0
  [] backend_changed+0x1b/0x20
  [] xenwatch_thread+0xb6/0x160
unreferenced object 0xffff880007ba8ef8 (size 224):

backtrace:
  [] kmemleak_alloc+0x28/0x50
  [] __kmalloc+0xd3/0x1e0
  [] frontend_changed+0x2c7/0x580
  [] xenbus_otherend_changed+0xa2/0xb0
  [] frontend_changed+0x10/0x20
  [] xenwatch_thread+0xb6/0x160
  [] kthread+0xd7/0xf0
  [] ret_from_fork+0x3f/0x70
  [] 0xffffffffffffffff
unreferenced object 0xffff8800048dcd38 (size 224):

The first leak is caused by not putting the be->blkif reference we took
in xen_blkif_alloc(), while the second is caused by not freeing
blkif->rings in the right place.

Signed-off-by: Bob Liu
Reported-and-Tested-by: Konrad Rzeszutek Wilk
Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkback/xenbus.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 44396b8a0cb2..876763f7f13e 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -297,8 +297,16 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
 		BUG_ON(ring->free_pages_num != 0);
 		BUG_ON(ring->persistent_gnt_c != 0);
 		WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+		xen_blkif_put(blkif);
 	}
 	blkif->nr_ring_pages = 0;
+	/*
+	 * blkif->rings was allocated in connect_ring, so we should free
+	 * it here.
+	 */
+	kfree(blkif->rings);
+	blkif->rings = NULL;
+	blkif->nr_rings = 0;
 
 	return 0;
 }
@@ -310,7 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 
 	xen_vbd_free(&blkif->vbd);
 	/* Make sure everything is drained before shutting down */
-	kfree(blkif->rings);
 	kmem_cache_free(xen_blkif_cachep, blkif);
 }
 
@@ -484,7 +491,6 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
 static int xen_blkbk_remove(struct xenbus_device *dev)
 {
 	struct backend_info *be = dev_get_drvdata(&dev->dev);
-	unsigned int i;
 
 	pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
 
@@ -499,12 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
 
 	dev_set_drvdata(&dev->dev, NULL);
 
-	if (be->blkif) {
+	if (be->blkif)
 		xen_blkif_disconnect(be->blkif);
-		for (i = 0; i < be->blkif->nr_rings; i++)
-			xen_blkif_put(be->blkif);
-	}
 
+	/* Put the reference we set in xen_blkif_alloc(). */
+	xen_blkif_put(be->blkif);
 	kfree(be->mode);
 	kfree(be);
 	return 0;

From c31ecf6c126dbc7f30234eaf6c4a079649a38de7 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 18 Dec 2015 16:28:53 -0500
Subject: [PATCH 21/21] xen/blkfront: Fix crash if backend doesn't follow the right states.

We have split the setting up of all the resources into two steps:
1) talk_to_blkback - which figures out the num_ring_pages (from the
   default value of zero), sets up shadow, and so on.
2) blkfront_connect - does the real part of filling out the internal
   structures.

The problem is that if we bypass step 1) and go straight to 2), we call
blkfront_setup_indirect, where we use the macro BLK_RING_SIZE - which
returns a negative value (because sz is zero, since num_ring_pages is
zero, since it has never been set).
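To illustrate the failure mode (an editor's stand-alone model; the macro
body and constants below are hypothetical stand-ins for the real ring
macros, chosen only to show how deriving the ring size from zero
negotiated pages goes negative):

    #include <stdio.h>

    /*
     * Hypothetical model of BLK_RING_SIZE: the number of ring entries is
     * derived from the negotiated number of ring pages. With nr_ring_pages
     * still at its default of zero, the byte count available for entries
     * underflows and the computed size is negative.
     */
    #define XEN_PAGE_SIZE  4096L
    #define RING_HDR_BYTES 512L  /* assumed shared-ring header overhead */
    #define ENTRY_BYTES    112L  /* assumed bytes per request/response slot */
    #define BLK_RING_SIZE(nr_ring_pages) \
        (((long)(nr_ring_pages) * XEN_PAGE_SIZE - RING_HDR_BYTES) / ENTRY_BYTES)

    int main(void)
    {
        printf("nr_ring_pages=1 -> %ld entries\n", BLK_RING_SIZE(1)); /* 32 */
        printf("nr_ring_pages=0 -> %ld entries\n", BLK_RING_SIZE(0)); /* -4 */
        return 0;
    }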
We can fix this by making sure that we always call talk_to_blkback
before going to blkfront_connect.

Or we could set info->nr_ring_pages = 1 in blkfront_probe to have a
default value. But that looks odd, as we haven't actually negotiated any
ring size.

This patch changes the XenbusStateConnected handling to detect whether
we have done the initial handshake - and if not, to continue on as if we
were in the XenbusStateInitWait state.

We also roll the error recovery (freeing the structure) into the
talk_to_blkback error path - which is safe since that function is only
called from blkback_changed.

Signed-off-by: Konrad Rzeszutek Wilk
---
 drivers/block/xen-blkfront.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index f3d0d4758641..8a8dc91c39f7 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1867,6 +1867,9 @@ again:
  destroy_blkring:
 	blkif_free(info, 0);
 
+	kfree(info);
+	dev_set_drvdata(&dev->dev, NULL);
+
 	return err;
 }
 
@@ -2453,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
 	case XenbusStateInitWait:
 		if (dev->state != XenbusStateInitialising)
 			break;
-		if (talk_to_blkback(dev, info)) {
-			kfree(info);
-			dev_set_drvdata(&dev->dev, NULL);
+		if (talk_to_blkback(dev, info))
 			break;
-		}
 	case XenbusStateInitialising:
 	case XenbusStateInitialised:
 	case XenbusStateReconfiguring:
@@ -2466,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
 		break;
 
 	case XenbusStateConnected:
+		if (dev->state != XenbusStateInitialised) {
+			if (talk_to_blkback(dev, info))
+				break;
+		}
 		blkfront_connect(info);
 		break;
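Finally, an editor's stand-alone model of the guard added above
(hypothetical scaffolding, not the driver code): if the backend reaches
Connected while the frontend never handled InitWait, redo the handshake
before connecting instead of crashing on an unset ring size.

    #include <stdio.h>
    #include <stdbool.h>

    enum fe_state { FE_INITIALISING, FE_INITIALISED, FE_CONNECTED };

    /* Stand-in for talk_to_blkback(): negotiates the ring size, false on error. */
    static bool handshake(unsigned int *nr_ring_pages)
    {
        *nr_ring_pages = 1; /* never left at the bogus default of zero */
        return true;
    }

    static void backend_connected(enum fe_state state, unsigned int *nr_ring_pages)
    {
        /* FE_INITIALISED means the handshake already ran via InitWait. */
        if (state != FE_INITIALISED && !handshake(nr_ring_pages))
            return; /* handshake failed: bail out rather than connect */

        printf("connecting with nr_ring_pages=%u\n", *nr_ring_pages);
    }

    int main(void)
    {
        unsigned int nr_ring_pages = 0;

        /* Backend jumped straight to Connected, skipping InitWait. */
        backend_connected(FE_INITIALISING, &nr_ring_pages);
        return 0;
    }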