OpenCloudOS-Kernel/drivers/block/virtio_blk.c

1060 lines
26 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
//#define DEBUG
#include <linux/spinlock.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
#include <linux/scatterlist.h>
#include <linux/string_helpers.h>
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-virtio.h>
#include <linux/numa.h>
#include <uapi/linux/virtio_ring.h>
#define PART_BITS 4
#define VQ_NAME_LEN 16
#define MAX_DISCARD_SEGMENTS 256u
/* The maximum number of sg elements that fit into a virtqueue */
#define VIRTIO_BLK_MAX_SG_ELEMS 32768
static int major;
static DEFINE_IDA(vd_index_ida);
static struct workqueue_struct *virtblk_wq;
struct virtio_blk_vq {
struct virtqueue *vq;
spinlock_t lock;
char name[VQ_NAME_LEN];
} ____cacheline_aligned_in_smp;
struct virtio_blk {
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
/*
* This mutex must be held by anything that may run after
* virtblk_remove() sets vblk->vdev to NULL.
*
* blk-mq, virtqueue processing, and sysfs attribute code paths are
* shut down before vblk->vdev is set to NULL and therefore do not need
* to hold this mutex.
*/
struct mutex vdev_mutex;
struct virtio_device *vdev;
/* The disk structure for the kernel. */
struct gendisk *disk;
/* Block layer tags. */
struct blk_mq_tag_set tag_set;
/* Process context for config space updates */
struct work_struct config_work;
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
/*
* Tracks references from block_device_operations open/release and
* virtio_driver probe/remove so this object can be freed once no
* longer in use.
*/
refcount_t refs;
/* What host tells us, plus 2 for header & tailer. */
unsigned int sg_elems;
/* Ida index - used to track minor number allocations. */
int index;
/* num of vqs */
int num_vqs;
struct virtio_blk_vq *vqs;
};
struct virtblk_req {
struct virtio_blk_outhdr out_hdr;
u8 status;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
struct scatterlist sg[];
};
static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
{
switch (vbr->status) {
case VIRTIO_BLK_S_OK:
return BLK_STS_OK;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
case VIRTIO_BLK_S_UNSUPP:
return BLK_STS_NOTSUPP;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
default:
return BLK_STS_IOERR;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
}
}
static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
struct scatterlist *data_sg, bool have_data)
{
struct scatterlist hdr, status, *sgs[3];
unsigned int num_out = 0, num_in = 0;
sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
sgs[num_out++] = &hdr;
if (have_data) {
if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
sgs[num_out++] = data_sg;
else
sgs[num_out + num_in++] = data_sg;
}
sg_init_one(&status, &vbr->status, sizeof(vbr->status));
sgs[num_out + num_in++] = &status;
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
{
unsigned short segments = blk_rq_nr_discard_segments(req);
unsigned short n = 0;
struct virtio_blk_discard_write_zeroes *range;
struct bio *bio;
u32 flags = 0;
if (unmap)
flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;
range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
if (!range)
return -ENOMEM;
/*
* Single max discard segment means multi-range discard isn't
* supported, and block layer only runs contiguity merge like
* normal RW request. So we can't reply on bio for retrieving
* each range info.
*/
if (queue_max_discard_segments(req->q) == 1) {
range[0].flags = cpu_to_le32(flags);
range[0].num_sectors = cpu_to_le32(blk_rq_sectors(req));
range[0].sector = cpu_to_le64(blk_rq_pos(req));
n = 1;
} else {
__rq_for_each_bio(bio, req) {
u64 sector = bio->bi_iter.bi_sector;
u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;
range[n].flags = cpu_to_le32(flags);
range[n].num_sectors = cpu_to_le32(num_sectors);
range[n].sector = cpu_to_le64(sector);
n++;
}
}
WARN_ON_ONCE(n != segments);
req->special_vec.bv_page = virt_to_page(range);
req->special_vec.bv_offset = offset_in_page(range);
req->special_vec.bv_len = sizeof(*range) * segments;
req->rq_flags |= RQF_SPECIAL_PAYLOAD;
return 0;
}
static inline void virtblk_request_done(struct request *req)
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
{
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
kfree(bvec_virt(&req->special_vec));
blk_mq_end_request(req, virtblk_result(vbr));
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
}
static void virtblk_done(struct virtqueue *vq)
{
struct virtio_blk *vblk = vq->vdev->priv;
bool req_done = false;
int qid = vq->index;
struct virtblk_req *vbr;
unsigned long flags;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
unsigned int len;
spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
virtio-blk: Disable callback in virtblk_done() This reduces unnecessary interrupts that host could send to guest while guest is in the progress of irq handling. If one vcpu is handling the irq, while another interrupt comes, in handle_edge_irq(), the guest will mask the interrupt via mask_msi_irq() which is a very heavy operation that goes all the way down to host. Here are some performance numbers on qemu: Before: ------------------------------------- seq-read : io=0 B, bw=269730KB/s, iops=67432 , runt= 62200msec seq-write : io=0 B, bw=339716KB/s, iops=84929 , runt= 49386msec rand-read : io=0 B, bw=270435KB/s, iops=67608 , runt= 62038msec rand-write: io=0 B, bw=354436KB/s, iops=88608 , runt= 47335msec clat (usec): min=101 , max=138052 , avg=14822.09, stdev=11771.01 clat (usec): min=96 , max=81543 , avg=11798.94, stdev=7735.60 clat (usec): min=128 , max=140043 , avg=14835.85, stdev=11765.33 clat (usec): min=109 , max=147207 , avg=11337.09, stdev=5990.35 cpu : usr=15.93%, sys=60.37%, ctx=7764972, majf=0, minf=54 cpu : usr=32.73%, sys=120.49%, ctx=7372945, majf=0, minf=1 cpu : usr=18.84%, sys=58.18%, ctx=7775420, majf=0, minf=1 cpu : usr=24.20%, sys=59.85%, ctx=8307886, majf=0, minf=0 vdb: ios=8389107/8368136, merge=0/0, ticks=19457874/14616506, in_queue=34206098, util=99.68% 43: interrupt in total: 887320 fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting --ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16 --ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0 --filename=/dev/vdb --name=seq-read --stonewall --rw=read --name=seq-write --stonewall --rw=write --name=rnd-read --stonewall --rw=randread --name=rnd-write --stonewall --rw=randwrite After: ------------------------------------- seq-read : io=0 B, bw=309503KB/s, iops=77375 , runt= 54207msec seq-write : io=0 B, bw=448205KB/s, iops=112051 , runt= 37432msec rand-read : io=0 B, bw=311254KB/s, iops=77813 , runt= 53902msec rand-write: io=0 B, bw=377152KB/s, iops=94287 , runt= 44484msec clat (usec): min=81 , max=90588 , avg=12946.06, stdev=9085.94 clat (usec): min=57 , max=72264 , avg=8967.97, stdev=5951.04 clat (usec): min=29 , max=101046 , avg=12889.95, stdev=9067.91 clat (usec): min=52 , max=106152 , avg=10660.56, stdev=4778.19 cpu : usr=15.05%, sys=57.92%, ctx=7710941, majf=0, minf=54 cpu : usr=26.78%, sys=101.40%, ctx=7387891, majf=0, minf=2 cpu : usr=19.03%, sys=58.17%, ctx=7681976, majf=0, minf=8 cpu : usr=24.65%, sys=58.34%, ctx=8442632, majf=0, minf=4 vdb: ios=8389086/8361888, merge=0/0, ticks=17243780/12742010, in_queue=30078377, util=99.59% 43: interrupt in total: 1259639 fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting --ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16 --ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0 --filename=/dev/vdb --name=seq-read --stonewall --rw=read --name=seq-write --stonewall --rw=write --name=rnd-read --stonewall --rw=randread --name=rnd-write --stonewall --rw=randwrite Signed-off-by: Asias He <asias@redhat.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-09-25 10:36:17 +08:00
do {
virtqueue_disable_cb(vq);
while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
struct request *req = blk_mq_rq_from_pdu(vbr);
if (likely(!blk_should_fake_timeout(req->q)))
blk_mq_complete_request(req);
req_done = true;
}
if (unlikely(virtqueue_is_broken(vq)))
break;
virtio-blk: Disable callback in virtblk_done() This reduces unnecessary interrupts that host could send to guest while guest is in the progress of irq handling. If one vcpu is handling the irq, while another interrupt comes, in handle_edge_irq(), the guest will mask the interrupt via mask_msi_irq() which is a very heavy operation that goes all the way down to host. Here are some performance numbers on qemu: Before: ------------------------------------- seq-read : io=0 B, bw=269730KB/s, iops=67432 , runt= 62200msec seq-write : io=0 B, bw=339716KB/s, iops=84929 , runt= 49386msec rand-read : io=0 B, bw=270435KB/s, iops=67608 , runt= 62038msec rand-write: io=0 B, bw=354436KB/s, iops=88608 , runt= 47335msec clat (usec): min=101 , max=138052 , avg=14822.09, stdev=11771.01 clat (usec): min=96 , max=81543 , avg=11798.94, stdev=7735.60 clat (usec): min=128 , max=140043 , avg=14835.85, stdev=11765.33 clat (usec): min=109 , max=147207 , avg=11337.09, stdev=5990.35 cpu : usr=15.93%, sys=60.37%, ctx=7764972, majf=0, minf=54 cpu : usr=32.73%, sys=120.49%, ctx=7372945, majf=0, minf=1 cpu : usr=18.84%, sys=58.18%, ctx=7775420, majf=0, minf=1 cpu : usr=24.20%, sys=59.85%, ctx=8307886, majf=0, minf=0 vdb: ios=8389107/8368136, merge=0/0, ticks=19457874/14616506, in_queue=34206098, util=99.68% 43: interrupt in total: 887320 fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting --ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16 --ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0 --filename=/dev/vdb --name=seq-read --stonewall --rw=read --name=seq-write --stonewall --rw=write --name=rnd-read --stonewall --rw=randread --name=rnd-write --stonewall --rw=randwrite After: ------------------------------------- seq-read : io=0 B, bw=309503KB/s, iops=77375 , runt= 54207msec seq-write : io=0 B, bw=448205KB/s, iops=112051 , runt= 37432msec rand-read : io=0 B, bw=311254KB/s, iops=77813 , runt= 53902msec rand-write: io=0 B, bw=377152KB/s, iops=94287 , runt= 44484msec clat (usec): min=81 , max=90588 , avg=12946.06, stdev=9085.94 clat (usec): min=57 , max=72264 , avg=8967.97, stdev=5951.04 clat (usec): min=29 , max=101046 , avg=12889.95, stdev=9067.91 clat (usec): min=52 , max=106152 , avg=10660.56, stdev=4778.19 cpu : usr=15.05%, sys=57.92%, ctx=7710941, majf=0, minf=54 cpu : usr=26.78%, sys=101.40%, ctx=7387891, majf=0, minf=2 cpu : usr=19.03%, sys=58.17%, ctx=7681976, majf=0, minf=8 cpu : usr=24.65%, sys=58.34%, ctx=8442632, majf=0, minf=4 vdb: ios=8389086/8361888, merge=0/0, ticks=17243780/12742010, in_queue=30078377, util=99.59% 43: interrupt in total: 1259639 fio --exec_prerun="echo 3 > /proc/sys/vm/drop_caches" --group_reporting --ioscheduler=noop --thread --bs=4k --size=512MB --direct=1 --numjobs=16 --ioengine=libaio --iodepth=64 --loops=3 --ramp_time=0 --filename=/dev/vdb --name=seq-read --stonewall --rw=read --name=seq-write --stonewall --rw=write --name=rnd-read --stonewall --rw=randread --name=rnd-write --stonewall --rw=randwrite Signed-off-by: Asias He <asias@redhat.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-09-25 10:36:17 +08:00
} while (!virtqueue_enable_cb(vq));
/* In case queue is stopped waiting for more buffers. */
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
if (req_done)
blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
}
static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
struct virtio_blk *vblk = hctx->queue->queuedata;
struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
bool kick;
spin_lock_irq(&vq->lock);
kick = virtqueue_kick_prepare(vq->vq);
spin_unlock_irq(&vq->lock);
if (kick)
virtqueue_notify(vq->vq);
}
static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct virtio_blk *vblk = hctx->queue->queuedata;
struct request *req = bd->rq;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
unsigned long flags;
unsigned int num;
int qid = hctx->queue_num;
int err;
bool notify = false;
bool unmap = false;
u32 type;
BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
switch (req_op(req)) {
case REQ_OP_READ:
case REQ_OP_WRITE:
type = 0;
break;
case REQ_OP_FLUSH:
type = VIRTIO_BLK_T_FLUSH;
break;
case REQ_OP_DISCARD:
type = VIRTIO_BLK_T_DISCARD;
break;
case REQ_OP_WRITE_ZEROES:
type = VIRTIO_BLK_T_WRITE_ZEROES;
unmap = !(req->cmd_flags & REQ_NOUNMAP);
break;
case REQ_OP_DRV_IN:
type = VIRTIO_BLK_T_GET_ID;
break;
default:
WARN_ON_ONCE(1);
return BLK_STS_IOERR;
}
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
vbr->out_hdr.sector = type ?
0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
blk_mq_start_request(req);
if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
err = virtblk_setup_discard_write_zeroes(req, unmap);
if (err)
return BLK_STS_RESOURCE;
}
num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
if (num) {
if (rq_data_dir(req) == WRITE)
vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
else
vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
}
spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
if (err) {
virtqueue_kick(vblk->vqs[qid].vq);
/* Don't stop the queue if -ENOMEM: we may have failed to
* bounce the buffer due to global resource outage.
*/
if (err == -ENOSPC)
blk_mq_stop_hw_queue(hctx);
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
switch (err) {
case -ENOSPC:
blk-mq: introduce BLK_STS_DEV_RESOURCE This status is returned from driver to block layer if device related resource is unavailable, but driver can guarantee that IO dispatch will be triggered in future when the resource is available. Convert some drivers to return BLK_STS_DEV_RESOURCE. Also, if driver returns BLK_STS_RESOURCE and SCHED_RESTART is set, rerun queue after a delay (BLK_MQ_DELAY_QUEUE) to avoid IO stalls. BLK_MQ_DELAY_QUEUE is 3 ms because both scsi-mq and nvmefc are using that magic value. If a driver can make sure there is in-flight IO, it is safe to return BLK_STS_DEV_RESOURCE because: 1) If all in-flight IOs complete before examining SCHED_RESTART in blk_mq_dispatch_rq_list(), SCHED_RESTART must be cleared, so queue is run immediately in this case by blk_mq_dispatch_rq_list(); 2) if there is any in-flight IO after/when examining SCHED_RESTART in blk_mq_dispatch_rq_list(): - if SCHED_RESTART isn't set, queue is run immediately as handled in 1) - otherwise, this request will be dispatched after any in-flight IO is completed via blk_mq_sched_restart() 3) if SCHED_RESTART is set concurently in context because of BLK_STS_RESOURCE, blk_mq_delay_run_hw_queue() will cover the above two cases and make sure IO hang can be avoided. One invariant is that queue will be rerun if SCHED_RESTART is set. Suggested-by: Jens Axboe <axboe@kernel.dk> Tested-by: Laurence Oberman <loberman@redhat.com> Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-01-31 11:04:57 +08:00
return BLK_STS_DEV_RESOURCE;
case -ENOMEM:
return BLK_STS_RESOURCE;
default:
return BLK_STS_IOERR;
}
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
}
if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
notify = true;
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
if (notify)
virtqueue_notify(vblk->vqs[qid].vq);
return BLK_STS_OK;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
}
/* return id (s/n) string for *disk to *id_str
*/
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
struct virtio_blk *vblk = disk->private_data;
struct request_queue *q = vblk->disk->queue;
struct request *req;
int err;
req = blk_get_request(q, REQ_OP_DRV_IN, 0);
if (IS_ERR(req))
return PTR_ERR(req);
err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;
blk_execute_rq(vblk->disk, req, false);
err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
out:
blk_put_request(req);
return err;
}
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
static void virtblk_get(struct virtio_blk *vblk)
{
refcount_inc(&vblk->refs);
}
static void virtblk_put(struct virtio_blk *vblk)
{
if (refcount_dec_and_test(&vblk->refs)) {
ida_simple_remove(&vd_index_ida, vblk->index);
mutex_destroy(&vblk->vdev_mutex);
kfree(vblk);
}
}
static int virtblk_open(struct block_device *bd, fmode_t mode)
{
struct virtio_blk *vblk = bd->bd_disk->private_data;
int ret = 0;
mutex_lock(&vblk->vdev_mutex);
if (vblk->vdev)
virtblk_get(vblk);
else
ret = -ENXIO;
mutex_unlock(&vblk->vdev_mutex);
return ret;
}
static void virtblk_release(struct gendisk *disk, fmode_t mode)
{
struct virtio_blk *vblk = disk->private_data;
virtblk_put(vblk);
}
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
struct virtio_blk *vblk = bd->bd_disk->private_data;
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
int ret = 0;
mutex_lock(&vblk->vdev_mutex);
if (!vblk->vdev) {
ret = -ENXIO;
goto out;
}
/* see if the host passed in geometry config */
if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
virtio_cread(vblk->vdev, struct virtio_blk_config,
geometry.cylinders, &geo->cylinders);
virtio_cread(vblk->vdev, struct virtio_blk_config,
geometry.heads, &geo->heads);
virtio_cread(vblk->vdev, struct virtio_blk_config,
geometry.sectors, &geo->sectors);
} else {
/* some standard values, similar to sd */
geo->heads = 1 << 6;
geo->sectors = 1 << 5;
geo->cylinders = get_capacity(bd->bd_disk) >> 11;
}
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
out:
mutex_unlock(&vblk->vdev_mutex);
return ret;
}
static const struct block_device_operations virtblk_fops = {
.owner = THIS_MODULE,
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
.open = virtblk_open,
.release = virtblk_release,
.getgeo = virtblk_getgeo,
};
static int index_to_minor(int index)
{
return index << PART_BITS;
}
static int minor_to_index(int minor)
{
return minor >> PART_BITS;
}
static ssize_t serial_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
int err;
/* sysfs gives us a PAGE_SIZE buffer */
BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);
buf[VIRTIO_BLK_ID_BYTES] = '\0';
err = virtblk_get_id(disk, buf);
if (!err)
return strlen(buf);
if (err == -EIO) /* Unsupported? Make it empty. */
return 0;
return err;
}
static DEVICE_ATTR_RO(serial);
/* The queue's logical block size must be set before calling this */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
struct virtio_device *vdev = vblk->vdev;
struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10];
unsigned long long nblocks;
u64 capacity;
/* Host must always specify the capacity. */
virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);
string_get_size(nblocks, queue_logical_block_size(q),
STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
string_get_size(nblocks, queue_logical_block_size(q),
STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
dev_notice(&vdev->dev,
"[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
vblk->disk->disk_name,
resize ? "new size: " : "",
nblocks,
queue_logical_block_size(q),
cap_str_10,
cap_str_2);
set_capacity_and_notify(vblk->disk, capacity);
}
static void virtblk_config_changed_work(struct work_struct *work)
{
struct virtio_blk *vblk =
container_of(work, struct virtio_blk, config_work);
virtblk_update_capacity(vblk, true);
}
static void virtblk_config_changed(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
queue_work(virtblk_wq, &vblk->config_work);
}
static int init_vq(struct virtio_blk *vblk)
{
int err;
int i;
vq_callback_t **callbacks;
const char **names;
struct virtqueue **vqs;
unsigned short num_vqs;
struct virtio_device *vdev = vblk->vdev;
struct irq_affinity desc = { 0, };
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
struct virtio_blk_config, num_queues,
&num_vqs);
if (err)
num_vqs = 1;
num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);
vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
if (!vblk->vqs)
return -ENOMEM;
names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
if (!names || !callbacks || !vqs) {
err = -ENOMEM;
goto out;
}
for (i = 0; i < num_vqs; i++) {
callbacks[i] = virtblk_done;
snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
names[i] = vblk->vqs[i].name;
}
/* Discover virtqueues and write information to configuration. */
err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
if (err)
goto out;
for (i = 0; i < num_vqs; i++) {
spin_lock_init(&vblk->vqs[i].lock);
vblk->vqs[i].vq = vqs[i];
}
vblk->num_vqs = num_vqs;
out:
kfree(vqs);
kfree(callbacks);
kfree(names);
if (err)
kfree(vblk->vqs);
return err;
}
/*
* Legacy naming scheme used for virtio devices. We are stuck with it for
* virtio blk but don't ever use it for any new driver.
*/
static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
{
const int base = 'z' - 'a' + 1;
char *begin = buf + strlen(prefix);
char *end = buf + buflen;
char *p;
int unit;
p = end - 1;
*p = '\0';
unit = base;
do {
if (p == begin)
return -EINVAL;
*--p = 'a' + (index % unit);
index = (index / unit) - 1;
} while (index >= 0);
memmove(begin, p, end - p);
memcpy(buf, prefix, strlen(prefix));
return 0;
}
static int virtblk_get_cache_mode(struct virtio_device *vdev)
{
u8 writeback;
int err;
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
struct virtio_blk_config, wce,
&writeback);
/*
* If WCE is not configurable and flush is not available,
* assume no writeback cache is in use.
*/
if (err)
writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);
return writeback;
}
static void virtblk_update_cache_mode(struct virtio_device *vdev)
{
u8 writeback = virtblk_get_cache_mode(vdev);
struct virtio_blk *vblk = vdev->priv;
blk_queue_write_cache(vblk->disk->queue, writeback, false);
}
static const char *const virtblk_cache_types[] = {
"write through", "write back"
};
static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
struct virtio_blk *vblk = disk->private_data;
struct virtio_device *vdev = vblk->vdev;
int i;
BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
i = sysfs_match_string(virtblk_cache_types, buf);
if (i < 0)
return i;
virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
virtblk_update_cache_mode(vdev);
return count;
}
static ssize_t
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
struct virtio_blk *vblk = disk->private_data;
u8 writeback = virtblk_get_cache_mode(vblk->vdev);
BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
}
static DEVICE_ATTR_RW(cache_type);
static struct attribute *virtblk_attrs[] = {
&dev_attr_serial.attr,
&dev_attr_cache_type.attr,
NULL,
};
static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = kobj_to_dev(kobj);
struct gendisk *disk = dev_to_disk(dev);
struct virtio_blk *vblk = disk->private_data;
struct virtio_device *vdev = vblk->vdev;
if (a == &dev_attr_cache_type.attr &&
!virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
return S_IRUGO;
return a->mode;
}
static const struct attribute_group virtblk_attr_group = {
.attrs = virtblk_attrs,
.is_visible = virtblk_attrs_are_visible,
};
static const struct attribute_group *virtblk_attr_groups[] = {
&virtblk_attr_group,
NULL,
};
static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, unsigned int numa_node)
{
struct virtio_blk *vblk = set->driver_data;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
sg_init_table(vbr->sg, vblk->sg_elems);
return 0;
}
static int virtblk_map_queues(struct blk_mq_tag_set *set)
{
struct virtio_blk *vblk = set->driver_data;
return blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
vblk->vdev, 0);
}
static const struct blk_mq_ops virtio_mq_ops = {
.queue_rq = virtio_queue_rq,
.commit_rqs = virtio_commit_rqs,
.complete = virtblk_request_done,
.init_request = virtblk_init_request,
.map_queues = virtblk_map_queues,
};
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
static int virtblk_validate(struct virtio_device *vdev)
{
u32 blk_size;
if (!vdev->config->get) {
dev_err(&vdev->dev, "%s failure: config access disabled\n",
__func__);
return -EINVAL;
}
if (!virtio_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE))
return 0;
blk_size = virtio_cread32(vdev,
offsetof(struct virtio_blk_config, blk_size));
if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)
__virtio_clear_bit(vdev, VIRTIO_BLK_F_BLK_SIZE);
return 0;
}
static int virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
struct request_queue *q;
int err, index;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
u32 v, blk_size, max_size, sg_elems, opt_io_size;
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
unsigned int queue_depth;
err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
GFP_KERNEL);
if (err < 0)
goto out;
index = err;
/* We need to know how many segments before we allocate. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
struct virtio_blk_config, seg_max,
&sg_elems);
/* We need at least one SG element, whatever they say. */
if (err || !sg_elems)
sg_elems = 1;
/* Prevent integer overflows and honor max vq size */
sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2);
/* We need extra sg elements at head and tail. */
sg_elems += 2;
vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
if (!vblk) {
err = -ENOMEM;
goto out_free_index;
}
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
/* This reference is dropped in virtblk_remove(). */
refcount_set(&vblk->refs, 1);
mutex_init(&vblk->vdev_mutex);
vblk->vdev = vdev;
vblk->sg_elems = sg_elems;
virtio-blk: Add bio-based IO path for virtio-blk This patch introduces bio-based IO path for virtio-blk. Compared to request-based IO path, bio-based IO path uses driver provided ->make_request_fn() method to bypasses the IO scheduler. It handles the bio to device directly without allocating a request in block layer. This reduces the IO path in guest kernel to achieve high IOPS and lower latency. The downside is that guest can not use the IO scheduler to merge and sort requests. However, this is not a big problem if the backend disk in host side uses faster disk device. When the bio-based IO path is not enabled, virtio-blk still uses the original request-based IO path, no performance difference is observed. Using a slow device e.g. normal SATA disk, the bio-based IO path for sequential read and write are slower than req-based IO path due to lack of merge in guest kernel. So we make the bio-based path optional. Performance evaluation: ----------------------------- 1) Fio test is performed in a 8 vcpu guest with ramdisk based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 28%, 24%, 21%, 16% Latency improvement: 32%, 17%, 21%, 16% Long version: With bio-based IO path: seq-read : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec rand-write: io=3095.7MB, bw=96198KB/s, iops=192396 , runt= 32952msec clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30 clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35 clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39 clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45 cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954 cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137 cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648 cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375 With request-based IO path: seq-read : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49 clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15 clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27 clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68 cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622 cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959 cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082 cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985 2) Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : 11%, 11%, 13%, 10% Latency improvement: 10%, 10%, 12%, 10% Long Version: With bio-based IO path: read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29 clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80 clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07 clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25 cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517 cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604 cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612 cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105 With request-based IO path: read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84 clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64 clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55 clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95 cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641 cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689 cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223 cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409 3) Fio test is performed in a 8 vcpu guest with normal SATA based guest using kvm tool. Short version: With bio-based IO path, sequential read/write, random read/write IOPS boost : -10%, -10%, 4.4%, 0.5% Latency improvement: -12%, -15%, 2.5%, 0.8% Long Version: With bio-based IO path: read : io=124812KB, bw=36537KB/s, iops=9060 , runt= 3416msec write: io=169180KB, bw=24406KB/s, iops=6065 , runt= 6932msec read : io=256200KB, bw=2089.3KB/s, iops=520 , runt=122630msec write: io=257988KB, bw=1545.7KB/s, iops=384 , runt=166910msec clat (msec): min=1 , max=1527 , avg=28.06, stdev=89.54 clat (msec): min=2 , max=344 , avg=41.12, stdev=38.70 clat (msec): min=8 , max=1984 , avg=490.63, stdev=207.28 clat (msec): min=33 , max=4131 , avg=659.19, stdev=304.71 cpu : usr=4.85%, sys=17.15%, ctx=31593, majf=0, minf=7 cpu : usr=3.04%, sys=11.45%, ctx=39377, majf=0, minf=0 cpu : usr=0.47%, sys=1.59%, ctx=262986, majf=0, minf=16 cpu : usr=0.47%, sys=1.46%, ctx=337410, majf=0, minf=0 With request-based IO path: read : io=150120KB, bw=40420KB/s, iops=10037 , runt= 3714msec write: io=194932KB, bw=27029KB/s, iops=6722 , runt= 7212msec read : io=257136KB, bw=2001.1KB/s, iops=498 , runt=128443msec write: io=258276KB, bw=1537.2KB/s, iops=382 , runt=168028msec clat (msec): min=1 , max=1542 , avg=24.84, stdev=32.45 clat (msec): min=3 , max=628 , avg=35.62, stdev=39.71 clat (msec): min=8 , max=2540 , avg=503.28, stdev=236.97 clat (msec): min=41 , max=4398 , avg=653.88, stdev=302.61 cpu : usr=3.91%, sys=15.75%, ctx=26968, majf=0, minf=23 cpu : usr=2.50%, sys=10.56%, ctx=19090, majf=0, minf=0 cpu : usr=0.16%, sys=0.43%, ctx=20159, majf=0, minf=16 cpu : usr=0.18%, sys=0.53%, ctx=81364, majf=0, minf=0 How to use: ----------------------------- Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk use_bio=1' to enable ->make_request_fn() based I/O path. Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Jens Axboe <axboe@kernel.dk> Cc: Christoph Hellwig <hch@lst.de> Cc: Tejun Heo <tj@kernel.org> Cc: Shaohua Li <shli@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Asias He <asias@redhat.com> Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2012-08-08 16:07:04 +08:00
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
err = init_vq(vblk);
if (err)
goto out_free_vblk;
/* Default queue sizing is to fill the ring. */
if (likely(!virtblk_queue_depth)) {
queue_depth = vblk->vqs[0].vq->num_free;
/* ... but without indirect descs, we use 2 descs per req */
if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
queue_depth /= 2;
} else {
queue_depth = virtblk_queue_depth;
}
memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
vblk->tag_set.ops = &virtio_mq_ops;
vblk->tag_set.queue_depth = queue_depth;
vblk->tag_set.numa_node = NUMA_NO_NODE;
vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
vblk->tag_set.cmd_size =
sizeof(struct virtblk_req) +
sizeof(struct scatterlist) * sg_elems;
vblk->tag_set.driver_data = vblk;
vblk->tag_set.nr_hw_queues = vblk->num_vqs;
err = blk_mq_alloc_tag_set(&vblk->tag_set);
if (err)
goto out_free_vq;
vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk);
if (IS_ERR(vblk->disk)) {
err = PTR_ERR(vblk->disk);
goto out_free_tags;
}
q = vblk->disk->queue;
virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
vblk->disk->major = major;
vblk->disk->first_minor = index_to_minor(index);
vblk->disk->minors = 1 << PART_BITS;
vblk->disk->private_data = vblk;
vblk->disk->fops = &virtblk_fops;
vblk->disk->flags |= GENHD_FL_EXT_DEVT;
vblk->index = index;
/* configure queue flush support */
virtblk_update_cache_mode(vdev);
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
set_disk_ro(vblk->disk, 1);
/* We can handle whatever the host told us to handle. */
blk_queue_max_segments(q, vblk->sg_elems-2);
/* No real sector limit. */
blk_queue_max_hw_sectors(q, -1U);
max_size = virtio_max_dma_size(vdev);
/* Host can optionally specify maximum segment size and number of
* segments. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
struct virtio_blk_config, size_max, &v);
if (!err)
max_size = min(max_size, v);
blk_queue_max_segment_size(q, max_size);
/* Host can optionally specify the block size of the device */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
struct virtio_blk_config, blk_size,
&blk_size);
if (!err)
blk_queue_logical_block_size(q, blk_size);
else
blk_size = queue_logical_block_size(q);
if (unlikely(blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)) {
dev_err(&vdev->dev,
"block size is changed unexpectedly, now is %u\n",
blk_size);
err = -EINVAL;
for-5.15/block-2021-08-30 -----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmEs6H0QHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpukbD/9Qk9fQte+WJVmpbdvhV40gcKBVnGOVH0ke k+36x6AB/gWKnFHwtprsSyVqPxmzqwTv9VIq5l/s3Vydt3L61znvTneBeN03Wlkn UTxD0lY8HzyVWnZb82LBBjjy7cs6EzrFG4kBH/ZiTAyTcBsCAvzo5J7mywb4gFjj L/HeBq58EJ3WCUlxlVW1ijctvi7wnGoaH5bZY1TE00GGT6TysN2bEPfzjkuYHrDz RqhoQdWPLDz6h3x9lAncPw2MWlcmlGvJ96ABseAKFPKvXxE2PzgolSoQfVUUJtko bqGyy2ns+pxN11SrcGYjogEKVKhONoms/5UN1RtwRBVsgvecxlHER/SgyZ8luBDo lFhVXulkSjpswbWutRy3USge98GwMu2Z4ppP2CDmO7hkQd0DF8sL0kPKyaREkcHi NmsD/0zF2uUhUVN+PRC/MuzngAmL4Mmxjk70L+MohlK7e+H3pnEo1ec3OMcXe+wB dG6t/BFD9bYmj0UjsHeXEoR/iRuvSba1L8zBz5dhRaHH6DvdycYhpynXWWlU3C8K 3nzEVVpcDINMsiRl1Vqb6g6HsMwHIH84FRl7Mc51UmhW9C4gLfWMCt1guQuzOj72 yEbmCLydE/FR2IUPY7eqX8hRG8GTUlMtSvGdgnvBOcWj+K3buT/c5yVTHgTrN8ox LCOXHSvV6w== =S8fs -----END PGP SIGNATURE----- Merge tag 'for-5.15/block-2021-08-30' of git://git.kernel.dk/linux-block Pull block updates from Jens Axboe: "Nothing major in here - lots of good cleanups and tech debt handling, which is also evident in the diffstats. In particular: - Add disk sequence numbers (Matteo) - Discard merge fix (Ming) - Relax disk zoned reporting restrictions (Niklas) - Bio error handling zoned leak fix (Pavel) - Start of proper add_disk() error handling (Luis, Christoph) - blk crypto fix (Eric) - Non-standard GPT location support (Dmitry) - IO priority improvements and cleanups (Damien)o - blk-throtl improvements (Chunguang) - diskstats_show() stack reduction (Abd-Alrhman) - Loop scheduler selection (Bart) - Switch block layer to use kmap_local_page() (Christoph) - Remove obsolete disk_name helper (Christoph) - block_device refcounting improvements (Christoph) - Ensure gendisk always has a request queue reference (Christoph) - Misc fixes/cleanups (Shaokun, Oliver, Guoqing)" * tag 'for-5.15/block-2021-08-30' of git://git.kernel.dk/linux-block: (129 commits) sg: pass the device name to blk_trace_setup block, bfq: cleanup the repeated declaration blk-crypto: fix check for too-large dun_bytes blk-zoned: allow BLKREPORTZONE without CAP_SYS_ADMIN blk-zoned: allow zone management send operations without CAP_SYS_ADMIN block: mark blkdev_fsync static block: refine the disk_live check in del_gendisk mmc: sdhci-tegra: Enable MMC_CAP2_ALT_GPT_TEGRA mmc: block: Support alternative_gpt_sector() operation partitions/efi: Support non-standard GPT location block: Add alternative_gpt_sector() operation bio: fix page leak bio_add_hw_page failure block: remove CONFIG_DEBUG_BLOCK_EXT_DEVT block: remove a pointless call to MINOR() in device_add_disk null_blk: add error handling support for add_disk() virtio_blk: add error handling support for add_disk() block: add error handling for device_add_disk / add_disk block: return errors from disk_alloc_events block: return errors from blk_integrity_add block: call blk_register_queue earlier in device_add_disk ...
2021-08-31 09:52:11 +08:00
goto out_cleanup_disk;
}
/* Use topology information if available */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, physical_block_exp,
&physical_block_exp);
if (!err && physical_block_exp)
blk_queue_physical_block_size(q,
blk_size * (1 << physical_block_exp));
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, alignment_offset,
&alignment_offset);
if (!err && alignment_offset)
blk_queue_alignment_offset(q, blk_size * alignment_offset);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, min_io_size,
&min_io_size);
if (!err && min_io_size)
blk_queue_io_min(q, blk_size * min_io_size);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, opt_io_size,
&opt_io_size);
if (!err && opt_io_size)
blk_queue_io_opt(q, blk_size * opt_io_size);
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
q->limits.discard_granularity = blk_size;
virtio_cread(vdev, struct virtio_blk_config,
discard_sector_alignment, &v);
q->limits.discard_alignment = v ? v << SECTOR_SHIFT : 0;
virtio_cread(vdev, struct virtio_blk_config,
max_discard_sectors, &v);
blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
&v);
blk_queue_max_discard_segments(q,
min_not_zero(v,
MAX_DISCARD_SEGMENTS));
blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
virtio_cread(vdev, struct virtio_blk_config,
max_write_zeroes_sectors, &v);
blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
}
virtblk_update_capacity(vblk, false);
virtio_device_ready(vdev);
err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
if (err)
goto out_cleanup_disk;
return 0;
out_cleanup_disk:
blk_cleanup_disk(vblk->disk);
out_free_tags:
blk_mq_free_tag_set(&vblk->tag_set);
out_free_vq:
vdev->config->del_vqs(vdev);
kfree(vblk->vqs);
out_free_vblk:
kfree(vblk);
out_free_index:
ida_simple_remove(&vd_index_ida, index);
out:
return err;
}
static void virtblk_remove(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
/* Make sure no work handler is accessing the device. */
flush_work(&vblk->config_work);
del_gendisk(vblk->disk);
blk_cleanup_disk(vblk->disk);
blk_mq_free_tag_set(&vblk->tag_set);
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
mutex_lock(&vblk->vdev_mutex);
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
/* Virtqueues are stopped, nothing can use vblk->vdev anymore. */
vblk->vdev = NULL;
vdev->config->del_vqs(vdev);
kfree(vblk->vqs);
virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[<ffffffffc00e5450>] [<ffffffffc00e5450>] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [<ffffffffc016ac37>] virtblk_getgeo+0x47/0x110 [virtio_blk] [<ffffffff8d3f200d>] ? handle_mm_fault+0x39d/0x9b0 [<ffffffff8d561265>] blkdev_ioctl+0x1f5/0xa20 [<ffffffff8d488771>] block_ioctl+0x41/0x50 [<ffffffff8d45d9e0>] do_vfs_ioctl+0x3a0/0x5a0 [<ffffffff8d45dc81>] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby <ldigby@redhat.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
2020-04-30 22:04:42 +08:00
mutex_unlock(&vblk->vdev_mutex);
virtblk_put(vblk);
}
#ifdef CONFIG_PM_SLEEP
static int virtblk_freeze(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
/* Ensure we don't receive any more interrupts */
vdev->config->reset(vdev);
/* Make sure no work handler is accessing the device. */
flush_work(&vblk->config_work);
blk_mq_quiesce_queue(vblk->disk->queue);
vdev->config->del_vqs(vdev);
kfree(vblk->vqs);
return 0;
}
static int virtblk_restore(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;
int ret;
ret = init_vq(vdev->priv);
if (ret)
return ret;
virtio_device_ready(vdev);
blk_mq_unquiesce_queue(vblk->disk->queue);
return 0;
}
#endif
static const struct virtio_device_id id_table[] = {
{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
{ 0 },
};
static unsigned int features_legacy[] = {
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
}
;
static unsigned int features[] = {
VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
};
static struct virtio_driver virtio_blk = {
.feature_table = features,
.feature_table_size = ARRAY_SIZE(features),
.feature_table_legacy = features_legacy,
.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table,
.validate = virtblk_validate,
.probe = virtblk_probe,
.remove = virtblk_remove,
.config_changed = virtblk_config_changed,
#ifdef CONFIG_PM_SLEEP
.freeze = virtblk_freeze,
.restore = virtblk_restore,
#endif
};
static int __init init(void)
{
int error;
virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
if (!virtblk_wq)
return -ENOMEM;
major = register_blkdev(0, "virtblk");
if (major < 0) {
error = major;
goto out_destroy_workqueue;
}
error = register_virtio_driver(&virtio_blk);
if (error)
goto out_unregister_blkdev;
return 0;
out_unregister_blkdev:
unregister_blkdev(major, "virtblk");
out_destroy_workqueue:
destroy_workqueue(virtblk_wq);
return error;
}
static void __exit fini(void)
{
unregister_virtio_driver(&virtio_blk);
unregister_blkdev(major, "virtblk");
destroy_workqueue(virtblk_wq);
}
module_init(init);
module_exit(fini);
MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio block driver");
MODULE_LICENSE("GPL");