2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2013-05-12 22:14:07 +08:00
|
|
|
* loop.h
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Written by Theodore Ts'o, 3/29/93.
|
|
|
|
*
|
|
|
|
* Copyright 1993 by Theodore Ts'o. Redistribution of this file is
|
|
|
|
* permitted under the GNU General Public License.
|
|
|
|
*/
|
2012-10-13 17:46:48 +08:00
|
|
|
#ifndef _LINUX_LOOP_H
|
|
|
|
#define _LINUX_LOOP_H
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <linux/bio.h>
|
|
|
|
#include <linux/blkdev.h>
|
2015-01-03 06:20:25 +08:00
|
|
|
#include <linux/blk-mq.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/spinlock.h>
|
2006-03-23 19:00:38 +08:00
|
|
|
#include <linux/mutex.h>
|
2015-08-17 10:31:48 +08:00
|
|
|
#include <linux/kthread.h>
|
2012-10-13 17:46:48 +08:00
|
|
|
#include <uapi/linux/loop.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
 * Possible states of device.
 *
 * The enumerators take the default values 0, 1 and 2 in declaration order.
 * NOTE(review): presumably these are the values kept in
 * loop_device.lo_state -- confirm against the driver source.
 */
enum {
	Lo_unbound,
	Lo_bound,
	Lo_rundown,
};
|
|
|
|
|
|
|
|
struct loop_func_table;
|
|
|
|
|
|
|
|
struct loop_device {
|
|
|
|
int lo_number;
|
2015-05-06 12:26:23 +08:00
|
|
|
atomic_t lo_refcnt;
|
2005-04-17 06:20:36 +08:00
|
|
|
loff_t lo_offset;
|
|
|
|
loff_t lo_sizelimit;
|
|
|
|
int lo_flags;
|
|
|
|
int (*transfer)(struct loop_device *, int cmd,
|
|
|
|
struct page *raw_page, unsigned raw_off,
|
|
|
|
struct page *loop_page, unsigned loop_off,
|
|
|
|
int size, sector_t real_block);
|
|
|
|
char lo_file_name[LO_NAME_SIZE];
|
|
|
|
char lo_crypt_name[LO_NAME_SIZE];
|
|
|
|
char lo_encrypt_key[LO_KEY_SIZE];
|
|
|
|
int lo_encrypt_key_size;
|
|
|
|
struct loop_func_table *lo_encryption;
|
|
|
|
__u32 lo_init[2];
|
2012-02-12 03:23:51 +08:00
|
|
|
kuid_t lo_key_owner; /* Who set the key */
|
2005-04-17 06:20:36 +08:00
|
|
|
int (*ioctl)(struct loop_device *, int cmd,
|
|
|
|
unsigned long arg);
|
|
|
|
|
|
|
|
struct file * lo_backing_file;
|
|
|
|
struct block_device *lo_device;
|
|
|
|
unsigned lo_blocksize;
|
|
|
|
void *key_data;
|
|
|
|
|
2005-10-21 15:22:34 +08:00
|
|
|
gfp_t old_gfp_mask;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
spinlock_t lo_lock;
|
|
|
|
int lo_state;
|
2006-03-23 19:00:38 +08:00
|
|
|
struct mutex lo_ctl_mutex;
|
2015-08-17 10:31:48 +08:00
|
|
|
struct kthread_worker worker;
|
|
|
|
struct task_struct *worker_task;
|
2015-08-17 10:31:49 +08:00
|
|
|
bool use_dio;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-24 09:44:00 +08:00
|
|
|
struct request_queue *lo_queue;
|
block: loop: improve performance via blk-mq
The conversion is a bit straightforward, and use work queue to
dispatch requests of loop block, and one big change is that requests
is submitted to backend file/device concurrently with work queue,
so throughput may get improved much. Given write requests over same
file are often run exclusively, so don't handle them concurrently for
avoiding extra context switch cost, possible lock contention and work
schedule cost. Also with blk-mq, there is opportunity to get loop I/O
merged before submitting to backend file/device.
In the following test:
- base: v3.19-rc2-2041231
- loop over file in ext4 file system on SSD disk
- bs: 4k, libaio, io depth: 64, O_DIRECT, num of jobs: 1
- throughput: IOPS
------------------------------------------------------
| | base | base with loop-mq | delta |
------------------------------------------------------
| randread | 1740 | 25318 | +1355%|
------------------------------------------------------
| read | 42196 | 51771 | +22.6%|
-----------------------------------------------------
| randwrite | 35709 | 34624 | -3% |
-----------------------------------------------------
| write | 39137 | 40326 | +3% |
-----------------------------------------------------
So loop-mq can improve throughput for both read and randread, meantime,
performance of write and randwrite isn't hurted basically.
Another benefit is that loop driver code gets simplified
much after blk-mq conversion, and the patch can be thought as
cleanup too.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2014-12-31 21:22:57 +08:00
|
|
|
struct blk_mq_tag_set tag_set;
|
2007-05-08 15:28:20 +08:00
|
|
|
struct gendisk *lo_disk;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
block: loop: improve performance via blk-mq
The conversion is fairly straightforward: a work queue is used to
dispatch the requests of the loop block device. One big change is that
requests are submitted to the backend file/device concurrently via the
work queue, so throughput may improve considerably. Since write requests
over the same file are often run exclusively, they are not handled
concurrently, which avoids extra context-switch cost, possible lock
contention and work scheduling cost. Also, with blk-mq there is an
opportunity to get loop I/O merged before it is submitted to the backend
file/device.
In the following test:
- base: v3.19-rc2-2041231
- loop over file in ext4 file system on SSD disk
- bs: 4k, libaio, io depth: 64, O_DIRECT, num of jobs: 1
- throughput: IOPS
------------------------------------------------------
| | base | base with loop-mq | delta |
------------------------------------------------------
| randread | 1740 | 25318 | +1355%|
------------------------------------------------------
| read | 42196 | 51771 | +22.6%|
-----------------------------------------------------
| randwrite | 35709 | 34624 | -3% |
-----------------------------------------------------
| write | 39137 | 40326 | +3% |
-----------------------------------------------------
So loop-mq can improve throughput for both read and randread, while
the performance of write and randwrite is essentially unaffected.
Another benefit is that the loop driver code is simplified considerably
by the blk-mq conversion, so the patch can also be regarded as a
cleanup.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2014-12-31 21:22:57 +08:00
|
|
|
struct loop_cmd {
|
2015-08-17 10:31:48 +08:00
|
|
|
struct kthread_work work;
|
block: loop: improve performance via blk-mq
The conversion is a bit straightforward, and use work queue to
dispatch requests of loop block, and one big change is that requests
is submitted to backend file/device concurrently with work queue,
so throughput may get improved much. Given write requests over same
file are often run exclusively, so don't handle them concurrently for
avoiding extra context switch cost, possible lock contention and work
schedule cost. Also with blk-mq, there is opportunity to get loop I/O
merged before submitting to backend file/device.
In the following test:
- base: v3.19-rc2-2041231
- loop over file in ext4 file system on SSD disk
- bs: 4k, libaio, io depth: 64, O_DIRECT, num of jobs: 1
- throughput: IOPS
------------------------------------------------------
| | base | base with loop-mq | delta |
------------------------------------------------------
| randread | 1740 | 25318 | +1355%|
------------------------------------------------------
| read | 42196 | 51771 | +22.6%|
-----------------------------------------------------
| randwrite | 35709 | 34624 | -3% |
-----------------------------------------------------
| write | 39137 | 40326 | +3% |
-----------------------------------------------------
So loop-mq can improve throughput for both read and randread, meantime,
performance of write and randwrite isn't hurted basically.
Another benefit is that loop driver code gets simplified
much after blk-mq conversion, and the patch can be thought as
cleanup too.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2014-12-31 21:22:57 +08:00
|
|
|
struct request *rq;
|
|
|
|
struct list_head list;
|
block: loop: support DIO & AIO
There are at least 3 advantages to use direct I/O and AIO on
read/write loop's backing file:
1) double cache can be avoided, then memory usage gets
decreased a lot
2) not like user space direct I/O, there isn't cost of
pinning pages
3) avoid context switch for obtaining good throughput
- in buffered file read, random I/O top throughput is often obtained
only if they are submitted concurrently from lots of tasks; but for
sequential I/O, most of times they can be hit from page cache, so
concurrent submissions often introduce unnecessary context switch
and can't improve throughput much. There was such discussion[1]
to use non-blocking I/O to improve the problem for application.
- with direct I/O and AIO, concurrent submissions can be
avoided and random read throughput can't be affected meantime
xfstests(-g auto, ext4) is basically passed when running with
direct I/O(aio), one exception is generic/232, but it failed in
loop buffered I/O(4.2-rc6-next-20150814) too.
Follows the fio test result for performance purpose:
4 jobs fio test inside ext4 file system over loop block
1) How to run
- KVM: 4 VCPUs, 2G RAM
- linux kernel: 4.2-rc6-next-20150814(base) with the patchset
- the loop block is over one image on SSD.
- linux psync, 4 jobs, size 1500M, ext4 over loop block
- test result: IOPS from fio output
2) Throughput(IOPS) becomes a bit better with direct I/O(aio)
-------------------------------------------------------------
test cases |randread |read |randwrite |write |
-------------------------------------------------------------
base |8015 |113811 |67442 |106978
-------------------------------------------------------------
base+loop aio |8136 |125040 |67811 |111376
-------------------------------------------------------------
- somehow, it should be caused by more page cache avaiable for
application or one extra page copy is avoided in case of direct I/O
3) context switch
- context switch decreased by ~50% with loop direct I/O(aio)
compared with loop buffered I/O(4.2-rc6-next-20150814)
4) memory usage from /proc/meminfo
-------------------------------------------------------------
| Buffers | Cached
-------------------------------------------------------------
base | > 760MB | ~950MB
-------------------------------------------------------------
base+loop direct I/O(aio) | < 5MB | ~1.6GB
-------------------------------------------------------------
- so there are much more page caches available for application with
direct I/O
[1] https://lwn.net/Articles/612483/
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
2015-08-17 10:31:51 +08:00
|
|
|
bool use_aio; /* use AIO interface to handle I/O */
|
|
|
|
struct kiocb iocb;
|
block: loop: improve performance via blk-mq
The conversion is a bit straightforward, and use work queue to
dispatch requests of loop block, and one big change is that requests
is submitted to backend file/device concurrently with work queue,
so throughput may get improved much. Given write requests over same
file are often run exclusively, so don't handle them concurrently for
avoiding extra context switch cost, possible lock contention and work
schedule cost. Also with blk-mq, there is opportunity to get loop I/O
merged before submitting to backend file/device.
In the following test:
- base: v3.19-rc2-2041231
- loop over file in ext4 file system on SSD disk
- bs: 4k, libaio, io depth: 64, O_DIRECT, num of jobs: 1
- throughput: IOPS
------------------------------------------------------
| | base | base with loop-mq | delta |
------------------------------------------------------
| randread | 1740 | 25318 | +1355%|
------------------------------------------------------
| read | 42196 | 51771 | +22.6%|
-----------------------------------------------------
| randwrite | 35709 | 34624 | -3% |
-----------------------------------------------------
| write | 39137 | 40326 | +3% |
-----------------------------------------------------
So loop-mq can improve throughput for both read and randread, meantime,
performance of write and randwrite isn't hurted basically.
Another benefit is that loop driver code gets simplified
much after blk-mq conversion, and the patch can be thought as
cleanup too.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
2014-12-31 21:22:57 +08:00
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Support for loadable transfer modules */
|
|
|
|
struct loop_func_table {
|
|
|
|
int number; /* filter type */
|
|
|
|
int (*transfer)(struct loop_device *lo, int cmd,
|
|
|
|
struct page *raw_page, unsigned raw_off,
|
|
|
|
struct page *loop_page, unsigned loop_off,
|
|
|
|
int size, sector_t real_block);
|
|
|
|
int (*init)(struct loop_device *, const struct loop_info64 *);
|
|
|
|
/* release is called from loop_unregister_transfer or clr_fd */
|
|
|
|
int (*release)(struct loop_device *);
|
|
|
|
int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
|
|
|
|
struct module *owner;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register a table of transfer callbacks.
 * NOTE(review): return-value convention is not visible in this header;
 * presumably 0 on success and a negative errno on failure -- confirm.
 */
int loop_register_transfer(struct loop_func_table *funcs);

/*
 * Unregister a previously registered transfer table.
 * NOTE(review): `number` presumably matches loop_func_table.number of the
 * table being removed -- confirm against the definition.
 */
int loop_unregister_transfer(int number);
|
|
|
|
|
|
|
|
#endif
|