OpenCloudOS-Kernel/drivers/md/dm.c

2781 lines
60 KiB
C
Raw Normal View History

/*
* Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
* Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#include "dm.h"
#include "dm-uevent.h"
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <<INTERRUPT>> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <<I/O completed>> dm_blk_close() dev_remove() dm_put(md) <<Free md>> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:13:56 +08:00
#include <linux/delay.h>
tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the deivce from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print. While blktrace do the convertion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array(). I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and no regression when using those trace events vs blktrace. And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array(). Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-09 13:43:05 +08:00
#include <trace/events/block.h>
#define DM_MSG_PREFIX "core"
#ifdef CONFIG_PRINTK
/*
* ratelimit state to be used in DMXXX_LIMIT().
*/
DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
EXPORT_SYMBOL(dm_ratelimit_state);
#endif
/*
* Cookies are numeric values sent with CHANGE and REMOVE
* uevents while resuming, removing or renaming the device.
*/
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
static const char *_name = DM_NAME;
static unsigned int major = 0;
static unsigned int _major = 0;
static DEFINE_IDR(_minor_idr);
static DEFINE_SPINLOCK(_minor_lock);
/*
* For bio-based dm.
* One of these is allocated per bio.
*/
struct dm_io {
struct mapped_device *md;
int error;
atomic_t io_count;
struct bio *bio;
unsigned long start_time;
spinlock_t endio_lock;
};
/*
* For bio-based dm.
* One of these is allocated per target within a bio. Hopefully
* this will be simplified out one day.
*/
struct dm_target_io {
struct dm_io *io;
struct dm_target *ti;
union map_info info;
};
/*
* For request-based dm.
* One of these is allocated per request.
*/
struct dm_rq_target_io {
struct mapped_device *md;
struct dm_target *ti;
struct request *orig, clone;
int error;
union map_info info;
};
/*
* For request-based dm.
* One of these is allocated per bio.
*/
struct dm_rq_clone_bio_info {
struct bio *orig;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
struct dm_rq_target_io *tio;
};
union map_info *dm_get_mapinfo(struct bio *bio)
{
if (bio && bio->bi_private)
return &((struct dm_target_io *)bio->bi_private)->info;
return NULL;
}
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
union map_info *dm_get_rq_mapinfo(struct request *rq)
{
if (rq && rq->end_io_data)
return &((struct dm_rq_target_io *)rq->end_io_data)->info;
return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define MINOR_ALLOCED ((void *)-1)
/*
* Bits for the md->flags field.
*/
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
/*
* Work processed by per-device workqueue.
*/
struct mapped_device {
struct rw_semaphore io_lock;
struct mutex suspend_lock;
rwlock_t map_lock;
atomic_t holders;
atomic_t open_count;
unsigned long flags;
struct request_queue *queue;
unsigned type;
dm: do not initialise full request queue when bio based Change bio-based mapped devices no longer to have a fully initialized request_queue (request_fn, elevator, etc). This means bio-based DM devices no longer register elevator sysfs attributes ('iosched/' tree or 'scheduler' other than "none"). In contrast, a request-based DM device will continue to have a full request_queue and will register elevator sysfs attributes. Therefore a user can determine a DM device's type by checking if elevator sysfs attributes exist. First allocate a minimalist request_queue structure for a DM device (needed for both bio and request-based DM). Initialization of a full request_queue is deferred until it is known that the DM device is request-based, at the end of the table load sequence. Factor DM device's request_queue initialization: - common to both request-based and bio-based into dm_init_md_queue(). - specific to request-based into dm_init_request_based_queue(). The md->type_lock mutex is used to protect md->queue, in addition to md->type, during table_load(). A DM device's first table_load will establish the immutable md->type. But md->queue initialization, based on md->type, may fail at that time (because blk_init_allocated_queue cannot allocate memory). Therefore any subsequent table_load must (re)try dm_setup_md_queue independently of establishing md->type. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:14:02 +08:00
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
struct target_type *immutable_target_type;
struct gendisk *disk;
char name[16];
void *interface_ptr;
/*
* A list of ios that arrived while we were suspended.
*/
atomic_t pending[2];
wait_queue_head_t wait;
struct work_struct work;
struct bio_list deferred;
spinlock_t deferred_lock;
/*
* Processing queue (flush)
*/
struct workqueue_struct *wq;
/*
* The current mapping.
*/
struct dm_table *map;
/*
* io objects are allocated from here.
*/
mempool_t *io_pool;
mempool_t *tio_pool;
struct bio_set *bs;
/*
* Event handling.
*/
atomic_t event_nr;
wait_queue_head_t eventq;
atomic_t uevent_seq;
struct list_head uevent_list;
spinlock_t uevent_lock; /* Protect access to uevent_list */
/*
* freeze/thaw support require holding onto a super block
*/
struct super_block *frozen_sb;
struct block_device *bdev;
/* forced geometry settings */
struct hd_geometry geometry;
/* sysfs handle */
struct kobject kobj;
dm: implement REQ_FLUSH/FUA support for bio-based dm This patch converts bio-based dm to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. * -EOPNOTSUPP handling logic dropped. * Preflush is handled as before but postflush is dropped and replaced with passing down REQ_FUA to member request_queues. This replaces one array wide cache flush w/ member specific FUA writes. * __split_and_process_bio() now calls __clone_and_map_flush() directly for flushes and guarantees all FLUSH bio's going to targets are zero ` length. * It's now guaranteed that all FLUSH bio's which are passed onto dm targets are zero length. bio_empty_barrier() tests are replaced with REQ_FLUSH tests. * Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes. * Dropped unlikely() around REQ_FLUSH tests. Flushes are not unlikely enough to be marked with unlikely(). * Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue doesn't support cache flushing. Advertise REQ_FLUSH | REQ_FUA capability. * Request based dm isn't converted yet. dm_init_request_based_queue() resets flush support to 0 for now. To avoid disturbing request based dm code, dm->flush_error is added for bio based dm while requested based dm continues to use dm->barrier_error. Lightly tested linear, stripe, raid1, snap and crypt targets. Please proceed with caution as I'm not familiar with the code base. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: dm-devel@redhat.com Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-03 17:56:19 +08:00
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;
};
/*
* For mempools pre-allocation at the table loading time.
*/
struct dm_md_mempools {
mempool_t *io_pool;
mempool_t *tio_pool;
struct bio_set *bs;
};
#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;
static int __init local_init(void)
{
int r = -ENOMEM;
/* allocate a slab for the dm_ios */
_io_cache = KMEM_CACHE(dm_io, 0);
if (!_io_cache)
return r;
/* allocate a slab for the target ios */
_tio_cache = KMEM_CACHE(dm_target_io, 0);
if (!_tio_cache)
goto out_free_io_cache;
_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
if (!_rq_tio_cache)
goto out_free_tio_cache;
_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
if (!_rq_bio_info_cache)
goto out_free_rq_tio_cache;
r = dm_uevent_init();
if (r)
goto out_free_rq_bio_info_cache;
_major = major;
r = register_blkdev(_major, _name);
if (r < 0)
goto out_uevent_exit;
if (!_major)
_major = r;
return 0;
out_uevent_exit:
dm_uevent_exit();
out_free_rq_bio_info_cache:
kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
kmem_cache_destroy(_tio_cache);
out_free_io_cache:
kmem_cache_destroy(_io_cache);
return r;
}
static void local_exit(void)
{
kmem_cache_destroy(_rq_bio_info_cache);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
dm_uevent_exit();
_major = 0;
DMINFO("cleaned up");
}
static int (*_inits[])(void) __initdata = {
local_init,
dm_target_init,
dm_linear_init,
dm_stripe_init,
dm_io_init,
dm_kcopyd_init,
dm_interface_init,
};
static void (*_exits[])(void) = {
local_exit,
dm_target_exit,
dm_linear_exit,
dm_stripe_exit,
dm_io_exit,
dm_kcopyd_exit,
dm_interface_exit,
};
static int __init dm_init(void)
{
const int count = ARRAY_SIZE(_inits);
int r, i;
for (i = 0; i < count; i++) {
r = _inits[i]();
if (r)
goto bad;
}
return 0;
bad:
while (i--)
_exits[i]();
return r;
}
static void __exit dm_exit(void)
{
int i = ARRAY_SIZE(_exits);
while (i--)
_exits[i]();
/*
* Should be empty by this point.
*/
idr_remove_all(&_minor_idr);
idr_destroy(&_minor_idr);
}
/*
* Block device functions
*/
int dm_deleting_md(struct mapped_device *md)
{
return test_bit(DMF_DELETING, &md->flags);
}
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mapped_device *md;
spin_lock(&_minor_lock);
md = bdev->bd_disk->private_data;
if (!md)
goto out;
if (test_bit(DMF_FREEING, &md->flags) ||
dm_deleting_md(md)) {
md = NULL;
goto out;
}
dm_get(md);
atomic_inc(&md->open_count);
out:
spin_unlock(&_minor_lock);
return md ? 0 : -ENXIO;
}
static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
struct mapped_device *md = disk->private_data;
spin_lock(&_minor_lock);
atomic_dec(&md->open_count);
dm_put(md);
spin_unlock(&_minor_lock);
return 0;
}
int dm_open_count(struct mapped_device *md)
{
return atomic_read(&md->open_count);
}
/*
* Guarantees nothing is using the device before it's deleted.
*/
int dm_lock_for_deletion(struct mapped_device *md)
{
int r = 0;
spin_lock(&_minor_lock);
if (dm_open_count(md))
r = -EBUSY;
else
set_bit(DMF_DELETING, &md->flags);
spin_unlock(&_minor_lock);
return r;
}
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
struct mapped_device *md = bdev->bd_disk->private_data;
return dm_get_geometry(md, geo);
}
static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct mapped_device *md = bdev->bd_disk->private_data;
struct dm_table *map = dm_get_live_table(md);
struct dm_target *tgt;
int r = -ENOTTY;
if (!map || !dm_table_get_size(map))
goto out;
/* We only support devices that have a single target */
if (dm_table_get_num_targets(map) != 1)
goto out;
tgt = dm_table_get_target(map, 0);
if (dm_suspended_md(md)) {
r = -EAGAIN;
goto out;
}
if (tgt->type->ioctl)
r = tgt->type->ioctl(tgt, cmd, arg);
out:
dm_table_put(map);
return r;
}
static struct dm_io *alloc_io(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_NOIO);
}
static void free_io(struct mapped_device *md, struct dm_io *io)
{
mempool_free(io, md->io_pool);
}
static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
mempool_free(tio, md->tio_pool);
}
static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
gfp_t gfp_mask)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
{
return mempool_alloc(md->tio_pool, gfp_mask);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
}
static void free_rq_tio(struct dm_rq_target_io *tio)
{
mempool_free(tio, tio->md->tio_pool);
}
static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
{
return mempool_alloc(md->io_pool, GFP_ATOMIC);
}
static void free_bio_info(struct dm_rq_clone_bio_info *info)
{
mempool_free(info, info->tio->md->io_pool);
}
static int md_in_flight(struct mapped_device *md)
{
return atomic_read(&md->pending[READ]) +
atomic_read(&md->pending[WRITE]);
}
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
int cpu;
int rw = bio_data_dir(io->bio);
io->start_time = jiffies;
cpu = part_stat_lock();
part_round_stats(cpu, &dm_disk(md)->part0);
part_stat_unlock();
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
}
static void end_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->bio;
unsigned long duration = jiffies - io->start_time;
int pending, cpu;
int rw = bio_data_dir(bio);
cpu = part_stat_lock();
part_round_stats(cpu, &dm_disk(md)->part0);
part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
part_stat_unlock();
/*
* After this is decremented the bio must not be touched if it is
dm: implement REQ_FLUSH/FUA support for bio-based dm This patch converts bio-based dm to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. * -EOPNOTSUPP handling logic dropped. * Preflush is handled as before but postflush is dropped and replaced with passing down REQ_FUA to member request_queues. This replaces one array wide cache flush w/ member specific FUA writes. * __split_and_process_bio() now calls __clone_and_map_flush() directly for flushes and guarantees all FLUSH bio's going to targets are zero ` length. * It's now guaranteed that all FLUSH bio's which are passed onto dm targets are zero length. bio_empty_barrier() tests are replaced with REQ_FLUSH tests. * Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes. * Dropped unlikely() around REQ_FLUSH tests. Flushes are not unlikely enough to be marked with unlikely(). * Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue doesn't support cache flushing. Advertise REQ_FLUSH | REQ_FUA capability. * Request based dm isn't converted yet. dm_init_request_based_queue() resets flush support to 0 for now. To avoid disturbing request based dm code, dm->flush_error is added for bio based dm while requested based dm continues to use dm->barrier_error. Lightly tested linear, stripe, raid1, snap and crypt targets. Please proceed with caution as I'm not familiar with the code base. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: dm-devel@redhat.com Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-03 17:56:19 +08:00
* a flush.
*/
pending = atomic_dec_return(&md->pending[rw]);
atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
pending += atomic_read(&md->pending[rw^0x1]);
/* nudge anyone waiting on suspend queue */
if (!pending)
wake_up(&md->wait);
}
/*
* Add the bio to the list of deferred io.
*/
static void queue_io(struct mapped_device *md, struct bio *bio)
{
unsigned long flags;
spin_lock_irqsave(&md->deferred_lock, flags);
bio_list_add(&md->deferred, bio);
spin_unlock_irqrestore(&md->deferred_lock, flags);
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
queue_work(md->wq, &md->work);
}
/*
* Everyone (including functions in this file), should use this
* function to access the md->map field, and make sure they call
* dm_table_put() when finished.
*/
struct dm_table *dm_get_live_table(struct mapped_device *md)
{
struct dm_table *t;
unsigned long flags;
read_lock_irqsave(&md->map_lock, flags);
t = md->map;
if (t)
dm_table_get(t);
read_unlock_irqrestore(&md->map_lock, flags);
return t;
}
/*
* Get the geometry associated with a dm device
*/
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
*geo = md->geometry;
return 0;
}
/*
* Set the geometry of a device.
*/
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
if (geo->start > sz) {
DMWARN("Start sector is beyond the geometry limits.");
return -EINVAL;
}
md->geometry = *geo;
return 0;
}
/*-----------------------------------------------------------------
* CRUD START:
* A more elegant soln is in the works that uses the queue
* merge fn, unfortunately there are a couple of changes to
* the block layer that I want to make for this. So in the
* interests of getting something for people to use I give
* you this clearly demarcated crap.
*---------------------------------------------------------------*/
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
static int __noflush_suspending(struct mapped_device *md)
{
return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
/*
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
*/
static void dec_pending(struct dm_io *io, int error)
{
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
unsigned long flags;
int io_error;
struct bio *bio;
struct mapped_device *md = io->md;
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
/* Push-back supersedes any I/O errors */
if (unlikely(error)) {
spin_lock_irqsave(&io->endio_lock, flags);
if (!(io->error > 0 && __noflush_suspending(md)))
io->error = error;
spin_unlock_irqrestore(&io->endio_lock, flags);
}
if (atomic_dec_and_test(&io->io_count)) {
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
if (io->error == DM_ENDIO_REQUEUE) {
/*
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
if (__noflush_suspending(md))
bio_list_add_head(&md->deferred, io->bio);
else
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
/* noflush suspend was interrupted. */
io->error = -EIO;
spin_unlock_irqrestore(&md->deferred_lock, flags);
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
}
io_error = io->error;
bio = io->bio;
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
end_io_acct(io);
free_io(md, io);
if (io_error == DM_ENDIO_REQUEUE)
return;
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
/*
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
* Preflush done for flush with data, reissue
* without REQ_FLUSH.
*/
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
bio->bi_rw &= ~REQ_FLUSH;
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
trace_block_bio_complete(md->queue, bio, io_error);
bio_endio(bio, io_error);
}
}
}
static void clone_endio(struct bio *bio, int error)
{
int r = 0;
struct dm_target_io *tio = bio->bi_private;
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
if (!bio_flagged(bio, BIO_UPTODATE) && !error)
error = -EIO;
if (endio) {
r = endio(tio->ti, bio, error, &tio->info);
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
if (r < 0 || r == DM_ENDIO_REQUEUE)
/*
* error and requeue request are handled
* in dec_pending().
*/
error = r;
2006-12-08 18:41:05 +08:00
else if (r == DM_ENDIO_INCOMPLETE)
/* The target will handle the io */
return;
2006-12-08 18:41:05 +08:00
else if (r) {
DMWARN("unimplemented target endio return value: %d", r);
BUG();
}
}
/*
* Store md for cleanup instead of tio which is about to get freed.
*/
bio->bi_private = md->bs;
free_tio(md, tio);
bio_put(bio);
dec_pending(io, error);
}
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* Partial completion handling for request-based dm
*/
static void end_clone_bio(struct bio *clone, int error)
{
struct dm_rq_clone_bio_info *info = clone->bi_private;
struct dm_rq_target_io *tio = info->tio;
struct bio *bio = info->orig;
unsigned int nr_bytes = info->orig->bi_size;
bio_put(clone);
if (tio->error)
/*
* An error has already been detected on the request.
* Once error occurred, just let clone->end_io() handle
* the remainder.
*/
return;
else if (error) {
/*
* Don't notice the error to the upper layer yet.
* The error handling decision is made by the target driver,
* when the request is completed.
*/
tio->error = error;
return;
}
/*
* I/O for the bio successfully completed.
* Notice the data completion to the upper layer.
*/
/*
* bios are processed from the head of the list.
* So the completing bio should always be rq->bio.
* If it's not, something wrong is happening.
*/
if (tio->orig->bio != bio)
DMERR("bio completion is going in the middle of the request");
/*
* Update the original request.
* Do not use blk_end_request() here, because it may complete
* the original request before the clone, and break the ordering.
*/
blk_update_request(tio->orig, 0, nr_bytes);
}
/*
* Don't touch any member of the md after calling this function because
* the md may be freed in dm_put() at the end of this function.
* Or do dm_get() before calling this function and dm_put() later.
*/
static void rq_completed(struct mapped_device *md, int rw, int run_queue)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
{
atomic_dec(&md->pending[rw]);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/* nudge anyone waiting on suspend queue */
if (!md_in_flight(md))
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
wake_up(&md->wait);
if (run_queue)
blk_run_queue(md->queue);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* dm_put() must be at the end of this function. See the comment above
*/
dm_put(md);
}
dm multipath: fix oops when request based io fails when no paths The patch posted at http://marc.info/?l=dm-devel&m=124539787228784&w=2 which was merged into cec47e3d4a861e1d942b3a580d0bbef2700d2bb2 ("dm: prepare for request based option") introduced a regression in request-based dm. If map_request() calls dm_kill_unmapped_request() to complete a cloned bio without dispatching it, clone->bio is still set when dm_end_request() is called and the BUG_ON(clone->bio) is incorrect. The patch fixes this bug by freeing bio in dm_end_request() if the clone has bio. I've redone my tests to cover all I/O paths and confirmed there's no other regression. Here is the oops I hit in request-based dm when I do I/O to a multipath device which doesn't have any active path nor queue_if_no_path setting: ------------[ cut here ]------------ kernel BUG at /root/2.6.31-rc4.rqdm/drivers/md/dm.c:828! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_map CPU 1 Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq dm_mirror dm_region_hash dm_log dm_service_time dm_multipath scsi_dh dm_mod video output sbs sbshc battery ac sg sr_mod e1000e button cdrom serio_raw rtc_cmos rtc_core rtc_lib piix lpfc scsi_transport_fc ata_piix libata megaraid_sas sd_mod scsi_mod crc_t10dif ext3 jbd uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode] Pid: 7, comm: ksoftirqd/1 Not tainted 2.6.31-rc4.rqdm #1 Express5800/120Lj [N8100-1417] RIP: 0010:[<ffffffffa023629d>] [<ffffffffa023629d>] dm_softirq_done+0xbd/0x100 [dm_mod] RSP: 0018:ffff8800280a1f08 EFLAGS: 00010282 RAX: ffffffffa02544e0 RBX: ffff8802aa1111d0 RCX: ffff8802aa1111e0 RDX: ffff8802ab913e70 RSI: 0000000000000000 RDI: ffff8802ab913e70 RBP: ffff8800280a1f28 R08: ffffc90005457040 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 00000000fffffffb R13: ffff8802ab913e88 R14: ffff8802ab9c1438 R15: 0000000000000100 FS: 0000000000000000(0000) GS:ffff88002809e000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000003d54a98640 CR3: 000000029f0a1000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process ksoftirqd/1 (pid: 7, threadinfo ffff8802ae50e000, task ffff8802ae4f8040) Stack: ffff8800280a1f38 0000000000000020 ffffffff814f30a0 0000000000000004 <0> ffff8800280a1f58 ffffffff8116b245 ffff8800280a1f38 ffff8800280a1f38 <0> ffff8800280a1f58 0000000000000001 ffff8800280a1fa8 ffffffff810477bc Call Trace: <IRQ> [<ffffffff8116b245>] blk_done_softirq+0x75/0x90 [<ffffffff810477bc>] __do_softirq+0xcc/0x210 [<ffffffff81047170>] ? ksoftirqd+0x0/0x110 [<ffffffff8100ce7c>] call_softirq+0x1c/0x50 <EOI> [<ffffffff8100e785>] do_softirq+0x65/0xa0 [<ffffffff81047170>] ? ksoftirqd+0x0/0x110 [<ffffffff810471e0>] ksoftirqd+0x70/0x110 [<ffffffff81059559>] kthread+0x99/0xb0 [<ffffffff8100cd7a>] child_rip+0xa/0x20 [<ffffffff8100c73c>] ? restore_args+0x0/0x30 [<ffffffff810594c0>] ? kthread+0x0/0xb0 [<ffffffff8100cd70>] ? child_rip+0x0/0x20 Code: 44 89 e6 48 89 df e8 23 fb f2 e0 be 01 00 00 00 4c 89 f7 e8 f6 fd ff ff 5b 41 5c 41 5d 41 5e c9 c3 4c 89 ef e8 85 fe ff ff eb ed <0f> 0b eb fe 41 8b 85 dc 00 00 00 48 83 bb 10 01 00 00 00 89 83 RIP [<ffffffffa023629d>] dm_softirq_done+0xbd/0x100 [dm_mod] RSP <ffff8800280a1f08> ---[ end trace 16af0a1d8542da55 ]--- Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-09-05 03:40:16 +08:00
static void free_rq_clone(struct request *clone)
{
struct dm_rq_target_io *tio = clone->end_io_data;
blk_rq_unprep_clone(clone);
free_rq_tio(tio);
}
/*
* Complete the clone and the original request.
* Must be called without queue lock.
*/
static void dm_end_request(struct request *clone, int error)
{
int rw = rq_data_dir(clone);
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
rq->errors = clone->errors;
rq->resid_len = clone->resid_len;
if (rq->sense)
/*
* We are using the sense buffer of the original
* request.
* So setting the length of the sense data is enough.
*/
rq->sense_len = clone->sense_len;
}
free_rq_clone(clone);
blk_end_request_all(rq, error);
rq_completed(md, rw, true);
}
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
static void dm_unprep_request(struct request *rq)
{
struct request *clone = rq->special;
rq->special = NULL;
rq->cmd_flags &= ~REQ_DONTPREP;
dm multipath: fix oops when request based io fails when no paths The patch posted at http://marc.info/?l=dm-devel&m=124539787228784&w=2 which was merged into cec47e3d4a861e1d942b3a580d0bbef2700d2bb2 ("dm: prepare for request based option") introduced a regression in request-based dm. If map_request() calls dm_kill_unmapped_request() to complete a cloned bio without dispatching it, clone->bio is still set when dm_end_request() is called and the BUG_ON(clone->bio) is incorrect. The patch fixes this bug by freeing bio in dm_end_request() if the clone has bio. I've redone my tests to cover all I/O paths and confirmed there's no other regression. Here is the oops I hit in request-based dm when I do I/O to a multipath device which doesn't have any active path nor queue_if_no_path setting: ------------[ cut here ]------------ kernel BUG at /root/2.6.31-rc4.rqdm/drivers/md/dm.c:828! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_map CPU 1 Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq dm_mirror dm_region_hash dm_log dm_service_time dm_multipath scsi_dh dm_mod video output sbs sbshc battery ac sg sr_mod e1000e button cdrom serio_raw rtc_cmos rtc_core rtc_lib piix lpfc scsi_transport_fc ata_piix libata megaraid_sas sd_mod scsi_mod crc_t10dif ext3 jbd uhci_hcd ohci_hcd ehci_hcd [last unloaded: microcode] Pid: 7, comm: ksoftirqd/1 Not tainted 2.6.31-rc4.rqdm #1 Express5800/120Lj [N8100-1417] RIP: 0010:[<ffffffffa023629d>] [<ffffffffa023629d>] dm_softirq_done+0xbd/0x100 [dm_mod] RSP: 0018:ffff8800280a1f08 EFLAGS: 00010282 RAX: ffffffffa02544e0 RBX: ffff8802aa1111d0 RCX: ffff8802aa1111e0 RDX: ffff8802ab913e70 RSI: 0000000000000000 RDI: ffff8802ab913e70 RBP: ffff8800280a1f28 R08: ffffc90005457040 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 00000000fffffffb R13: ffff8802ab913e88 R14: ffff8802ab9c1438 R15: 0000000000000100 FS: 0000000000000000(0000) GS:ffff88002809e000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000003d54a98640 CR3: 000000029f0a1000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process ksoftirqd/1 (pid: 7, threadinfo ffff8802ae50e000, task ffff8802ae4f8040) Stack: ffff8800280a1f38 0000000000000020 ffffffff814f30a0 0000000000000004 <0> ffff8800280a1f58 ffffffff8116b245 ffff8800280a1f38 ffff8800280a1f38 <0> ffff8800280a1f58 0000000000000001 ffff8800280a1fa8 ffffffff810477bc Call Trace: <IRQ> [<ffffffff8116b245>] blk_done_softirq+0x75/0x90 [<ffffffff810477bc>] __do_softirq+0xcc/0x210 [<ffffffff81047170>] ? ksoftirqd+0x0/0x110 [<ffffffff8100ce7c>] call_softirq+0x1c/0x50 <EOI> [<ffffffff8100e785>] do_softirq+0x65/0xa0 [<ffffffff81047170>] ? ksoftirqd+0x0/0x110 [<ffffffff810471e0>] ksoftirqd+0x70/0x110 [<ffffffff81059559>] kthread+0x99/0xb0 [<ffffffff8100cd7a>] child_rip+0xa/0x20 [<ffffffff8100c73c>] ? restore_args+0x0/0x30 [<ffffffff810594c0>] ? kthread+0x0/0xb0 [<ffffffff8100cd70>] ? child_rip+0x0/0x20 Code: 44 89 e6 48 89 df e8 23 fb f2 e0 be 01 00 00 00 4c 89 f7 e8 f6 fd ff ff 5b 41 5c 41 5d 41 5e c9 c3 4c 89 ef e8 85 fe ff ff eb ed <0f> 0b eb fe 41 8b 85 dc 00 00 00 48 83 bb 10 01 00 00 00 89 83 RIP [<ffffffffa023629d>] dm_softirq_done+0xbd/0x100 [dm_mod] RSP <ffff8800280a1f08> ---[ end trace 16af0a1d8542da55 ]--- Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-09-05 03:40:16 +08:00
free_rq_clone(clone);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
}
/*
* Requeue the original request of a clone.
*/
void dm_requeue_unmapped_request(struct request *clone)
{
int rw = rq_data_dir(clone);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
struct request_queue *q = rq->q;
unsigned long flags;
dm_unprep_request(rq);
spin_lock_irqsave(q->queue_lock, flags);
blk_requeue_request(q, rq);
spin_unlock_irqrestore(q->queue_lock, flags);
rq_completed(md, rw, 0);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
}
EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
static void __stop_queue(struct request_queue *q)
{
blk_stop_queue(q);
}
static void stop_queue(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
__stop_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
static void __start_queue(struct request_queue *q)
{
if (blk_queue_stopped(q))
blk_start_queue(q);
}
static void start_queue(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
__start_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
static void dm_done(struct request *clone, int error, bool mapped)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
{
int r = error;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
struct dm_rq_target_io *tio = clone->end_io_data;
dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
if (mapped && rq_end_io)
r = rq_end_io(tio->ti, clone, error, &tio->info);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
if (r <= 0)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/* The target wants to complete the I/O */
dm_end_request(clone, r);
else if (r == DM_ENDIO_INCOMPLETE)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/* The target will handle the I/O */
return;
else if (r == DM_ENDIO_REQUEUE)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/* The target wants to requeue the I/O */
dm_requeue_unmapped_request(clone);
else {
DMWARN("unimplemented target endio return value: %d", r);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
BUG();
}
}
/*
* Request completion handler for request-based dm
*/
static void dm_softirq_done(struct request *rq)
{
bool mapped = true;
struct request *clone = rq->completion_data;
struct dm_rq_target_io *tio = clone->end_io_data;
if (rq->cmd_flags & REQ_FAILED)
mapped = false;
dm_done(clone, tio->error, mapped);
}
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* Complete the clone and the original request with the error status
* through softirq context.
*/
static void dm_complete_request(struct request *clone, int error)
{
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
tio->error = error;
rq->completion_data = clone;
blk_complete_request(rq);
}
/*
* Complete the not-mapped clone and the original request with the error status
* through softirq context.
* Target's rq_end_io() function isn't called.
* This may be used when the target's map_rq() function fails.
*/
void dm_kill_unmapped_request(struct request *clone, int error)
{
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
rq->cmd_flags |= REQ_FAILED;
dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
/*
* Called with the queue lock held
*/
static void end_clone_request(struct request *clone, int error)
{
/*
* For just cleaning up the information of the queue in which
* the clone was dispatched.
* The clone is *NOT* freed actually here because it is alloced from
* dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
*/
__blk_put_request(clone->q, clone);
/*
* Actual request completion is done in a softirq context which doesn't
* hold the queue lock. Otherwise, deadlock could occur because:
* - another request may be submitted by the upper level driver
* of the stacking during the completion
* - the submission which requires queue lock may be done
* against this queue
*/
dm_complete_request(clone, error);
}
/*
* Return maximum size of I/O possible at the supplied sector up to the current
* target boundary.
*/
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
sector_t target_offset = dm_target_offset(ti, sector);
return ti->len - target_offset;
}
static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
sector_t len = max_io_len_target_boundary(sector, ti);
/*
* Does the target need to split even further ?
*/
if (ti->split_io) {
sector_t boundary;
sector_t offset = dm_target_offset(ti, sector);
boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
- offset;
if (len > boundary)
len = boundary;
}
return len;
}
static void __map_bio(struct dm_target *ti, struct bio *clone,
struct dm_target_io *tio)
{
int r;
sector_t sector;
struct mapped_device *md;
clone->bi_end_io = clone_endio;
clone->bi_private = tio;
/*
* Map the clone. If r == 0 we don't need to do
* anything, the target has assumed ownership of
* this io.
*/
atomic_inc(&tio->io->io_count);
sector = clone->bi_sector;
r = ti->type->map(ti, clone, &tio->info);
2006-12-08 18:41:05 +08:00
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev, sector);
generic_make_request(clone);
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
/* error the io and bail out, or requeue it if needed */
md = tio->io->md;
dec_pending(tio->io, r);
/*
* Store bio_set for cleanup.
*/
clone->bi_end_io = NULL;
clone->bi_private = md->bs;
bio_put(clone);
free_tio(md, tio);
2006-12-08 18:41:05 +08:00
} else if (r) {
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
}
struct clone_info {
struct mapped_device *md;
struct dm_table *map;
struct bio *bio;
struct dm_io *io;
sector_t sector;
sector_t sector_count;
unsigned short idx;
};
static void dm_bio_destructor(struct bio *bio)
{
struct bio_set *bs = bio->bi_private;
bio_free(bio, bs);
}
/*
dm: implement REQ_FLUSH/FUA support for bio-based dm This patch converts bio-based dm to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. * -EOPNOTSUPP handling logic dropped. * Preflush is handled as before but postflush is dropped and replaced with passing down REQ_FUA to member request_queues. This replaces one array wide cache flush w/ member specific FUA writes. * __split_and_process_bio() now calls __clone_and_map_flush() directly for flushes and guarantees all FLUSH bio's going to targets are zero ` length. * It's now guaranteed that all FLUSH bio's which are passed onto dm targets are zero length. bio_empty_barrier() tests are replaced with REQ_FLUSH tests. * Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes. * Dropped unlikely() around REQ_FLUSH tests. Flushes are not unlikely enough to be marked with unlikely(). * Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue doesn't support cache flushing. Advertise REQ_FLUSH | REQ_FUA capability. * Request based dm isn't converted yet. dm_init_request_based_queue() resets flush support to 0 for now. To avoid disturbing request based dm code, dm->flush_error is added for bio based dm while requested based dm continues to use dm->barrier_error. Lightly tested linear, stripe, raid1, snap and crypt targets. Please proceed with caution as I'm not familiar with the code base. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: dm-devel@redhat.com Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-03 17:56:19 +08:00
* Creates a little bio that just does part of a bvec.
*/
static struct bio *split_bvec(struct bio *bio, sector_t sector,
unsigned short idx, unsigned int offset,
unsigned int len, struct bio_set *bs)
{
struct bio *clone;
struct bio_vec *bv = bio->bi_io_vec + idx;
clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
clone->bi_destructor = dm_bio_destructor;
*clone->bi_io_vec = *bv;
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
dm: implement REQ_FLUSH/FUA support for bio-based dm This patch converts bio-based dm to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. * -EOPNOTSUPP handling logic dropped. * Preflush is handled as before but postflush is dropped and replaced with passing down REQ_FUA to member request_queues. This replaces one array wide cache flush w/ member specific FUA writes. * __split_and_process_bio() now calls __clone_and_map_flush() directly for flushes and guarantees all FLUSH bio's going to targets are zero ` length. * It's now guaranteed that all FLUSH bio's which are passed onto dm targets are zero length. bio_empty_barrier() tests are replaced with REQ_FLUSH tests. * Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes. * Dropped unlikely() around REQ_FLUSH tests. Flushes are not unlikely enough to be marked with unlikely(). * Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue doesn't support cache flushing. Advertise REQ_FLUSH | REQ_FUA capability. * Request based dm isn't converted yet. dm_init_request_based_queue() resets flush support to 0 for now. To avoid disturbing request based dm code, dm->flush_error is added for bio based dm while requested based dm continues to use dm->barrier_error. Lightly tested linear, stripe, raid1, snap and crypt targets. Please proceed with caution as I'm not familiar with the code base. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: dm-devel@redhat.com Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-03 17:56:19 +08:00
clone->bi_rw = bio->bi_rw;
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone->bi_io_vec->bv_len = clone->bi_size;
clone->bi_flags |= 1 << BIO_CLONED;
if (bio_integrity(bio)) {
bio_integrity_clone(clone, bio, GFP_NOIO, bs);
bio_integrity_trim(clone,
bio_sector_offset(bio, idx, offset), len);
}
return clone;
}
/*
* Creates a bio that consists of range of complete bvecs.
*/
static struct bio *clone_bio(struct bio *bio, sector_t sector,
unsigned short idx, unsigned short bv_count,
unsigned int len, struct bio_set *bs)
{
struct bio *clone;
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
clone->bi_vcnt = idx + bv_count;
clone->bi_size = to_bytes(len);
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
if (bio_integrity(bio)) {
bio_integrity_clone(clone, bio, GFP_NOIO, bs);
if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
bio_integrity_trim(clone,
bio_sector_offset(bio, idx, 0), len);
}
return clone;
}
static struct dm_target_io *alloc_tio(struct clone_info *ci,
struct dm_target *ti)
{
struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
tio->io = ci->io;
tio->ti = ti;
memset(&tio->info, 0, sizeof(tio->info));
return tio;
}
static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
unsigned request_nr, sector_t len)
{
struct dm_target_io *tio = alloc_tio(ci, ti);
struct bio *clone;
tio->info.target_request_nr = request_nr;
/*
* Discard requests require the bio's inline iovecs be initialized.
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
* and discard, so no need for concern about wasted bvec allocations.
*/
clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
__bio_clone(clone, ci->bio);
clone->bi_destructor = dm_bio_destructor;
if (len) {
clone->bi_sector = ci->sector;
clone->bi_size = to_bytes(len);
}
__map_bio(ti, clone, tio);
}
static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
unsigned num_requests, sector_t len)
{
unsigned request_nr;
for (request_nr = 0; request_nr < num_requests; request_nr++)
__issue_target_request(ci, ti, request_nr, len);
}
static int __clone_and_map_empty_flush(struct clone_info *ci)
{
unsigned target_nr = 0;
struct dm_target *ti;
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
return 0;
}
/*
* Perform all io with a single clone.
*/
static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
{
struct bio *clone, *bio = ci->bio;
struct dm_target_io *tio;
tio = alloc_tio(ci, ti);
clone = clone_bio(bio, ci->sector, ci->idx,
bio->bi_vcnt - ci->idx, ci->sector_count,
ci->md->bs);
__map_bio(ti, clone, tio);
ci->sector_count = 0;
}
static int __clone_and_map_discard(struct clone_info *ci)
{
struct dm_target *ti;
sector_t len;
do {
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
/*
* Even though the device advertised discard support,
* that does not mean every target supports it, and
* reconfiguration might also have changed that since the
* check was performed.
*/
if (!ti->num_discard_requests)
return -EOPNOTSUPP;
len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
__issue_target_requests(ci, ti, ti->num_discard_requests, len);
ci->sector += len;
} while (ci->sector_count -= len);
return 0;
}
static int __clone_and_map(struct clone_info *ci)
{
struct bio *clone, *bio = ci->bio;
struct dm_target *ti;
sector_t len = 0, max;
struct dm_target_io *tio;
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __clone_and_map_discard(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
max = max_io_len(ci->sector, ti);
if (ci->sector_count <= max) {
/*
* Optimise for the simple case where we can do all of
* the remaining io with a single clone.
*/
__clone_and_map_simple(ci, ti);
} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
/*
* There are some bvecs that don't span targets.
* Do as many of these as possible.
*/
int i;
sector_t remaining = max;
sector_t bv_len;
for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
bv_len = to_sector(bio->bi_io_vec[i].bv_len);
if (bv_len > remaining)
break;
remaining -= bv_len;
len += bv_len;
}
tio = alloc_tio(ci, ti);
clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
ci->md->bs);
__map_bio(ti, clone, tio);
ci->sector += len;
ci->sector_count -= len;
ci->idx = i;
} else {
/*
* Handle a bvec that must be split between two or more targets.
*/
struct bio_vec *bv = bio->bi_io_vec + ci->idx;
sector_t remaining = to_sector(bv->bv_len);
unsigned int offset = 0;
do {
if (offset) {
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
return -EIO;
max = max_io_len(ci->sector, ti);
}
len = min(remaining, max);
tio = alloc_tio(ci, ti);
clone = split_bvec(bio, ci->sector, ci->idx,
bv->bv_offset + offset, len,
ci->md->bs);
__map_bio(ti, clone, tio);
ci->sector += len;
ci->sector_count -= len;
offset += to_bytes(len);
} while (remaining -= len);
ci->idx++;
}
return 0;
}
/*
* Split the bio into several clones and submit it to targets.
*/
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
struct clone_info ci;
int error = 0;
ci.map = dm_get_live_table(md);
if (unlikely(!ci.map)) {
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
bio_io_error(bio);
return;
}
ci.md = md;
ci.io = alloc_io(md);
ci.io->error = 0;
atomic_set(&ci.io->io_count, 1);
ci.io->bio = bio;
ci.io->md = md;
spin_lock_init(&ci.io->endio_lock);
ci.sector = bio->bi_sector;
ci.idx = bio->bi_idx;
start_io_acct(ci.io);
if (bio->bi_rw & REQ_FLUSH) {
ci.bio = &ci.md->flush_bio;
ci.sector_count = 0;
error = __clone_and_map_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
} else {
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
ci.bio = bio;
dm: implement REQ_FLUSH/FUA support for bio-based dm This patch converts bio-based dm to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. * -EOPNOTSUPP handling logic dropped. * Preflush is handled as before but postflush is dropped and replaced with passing down REQ_FUA to member request_queues. This replaces one array wide cache flush w/ member specific FUA writes. * __split_and_process_bio() now calls __clone_and_map_flush() directly for flushes and guarantees all FLUSH bio's going to targets are zero ` length. * It's now guaranteed that all FLUSH bio's which are passed onto dm targets are zero length. bio_empty_barrier() tests are replaced with REQ_FLUSH tests. * Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes. * Dropped unlikely() around REQ_FLUSH tests. Flushes are not unlikely enough to be marked with unlikely(). * Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue doesn't support cache flushing. Advertise REQ_FLUSH | REQ_FUA capability. * Request based dm isn't converted yet. dm_init_request_based_queue() resets flush support to 0 for now. To avoid disturbing request based dm code, dm->flush_error is added for bio based dm while requested based dm continues to use dm->barrier_error. Lightly tested linear, stripe, raid1, snap and crypt targets. Please proceed with caution as I'm not familiar with the code base. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: dm-devel@redhat.com Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-03 17:56:19 +08:00
ci.sector_count = bio_sectors(bio);
while (ci.sector_count && !error)
dm: implement REQ_FLUSH/FUA support for bio-based dm This patch converts bio-based dm to support REQ_FLUSH/FUA instead of now deprecated REQ_HARDBARRIER. * -EOPNOTSUPP handling logic dropped. * Preflush is handled as before but postflush is dropped and replaced with passing down REQ_FUA to member request_queues. This replaces one array wide cache flush w/ member specific FUA writes. * __split_and_process_bio() now calls __clone_and_map_flush() directly for flushes and guarantees all FLUSH bio's going to targets are zero ` length. * It's now guaranteed that all FLUSH bio's which are passed onto dm targets are zero length. bio_empty_barrier() tests are replaced with REQ_FLUSH tests. * Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes. * Dropped unlikely() around REQ_FLUSH tests. Flushes are not unlikely enough to be marked with unlikely(). * Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue doesn't support cache flushing. Advertise REQ_FLUSH | REQ_FUA capability. * Request based dm isn't converted yet. dm_init_request_based_queue() resets flush support to 0 for now. To avoid disturbing request based dm code, dm->flush_error is added for bio based dm while requested based dm continues to use dm->barrier_error. Lightly tested linear, stripe, raid1, snap and crypt targets. Please proceed with caution as I'm not familiar with the code base. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: dm-devel@redhat.com Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-03 17:56:19 +08:00
error = __clone_and_map(&ci);
}
/* drop the extra reference count */
dec_pending(ci.io, error);
dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
* CRUD END
*---------------------------------------------------------------*/
static int dm_merge_bvec(struct request_queue *q,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
struct mapped_device *md = q->queuedata;
struct dm_table *map = dm_get_live_table(md);
struct dm_target *ti;
sector_t max_sectors;
int max_size = 0;
if (unlikely(!map))
goto out;
ti = dm_table_find_target(map, bvm->bi_sector);
if (!dm_target_is_valid(ti))
goto out_table;
/*
* Find maximum amount of I/O that won't need splitting
*/
max_sectors = min(max_io_len(bvm->bi_sector, ti),
(sector_t) BIO_MAX_SECTORS);
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
if (max_size < 0)
max_size = 0;
/*
* merge_bvec_fn() returns number of bytes
* it can accept at this offset
* max is precomputed maximal io size
*/
if (max_size && ti->type->merge)
max_size = ti->type->merge(ti, bvm, biovec, max_size);
/*
* If the target doesn't support merge method and some of the devices
* provided their merge_bvec method (we know this by looking at
* queue_max_hw_sectors), then we can't allow bios with multiple vector
* entries. So always set max_size to 0, and the code below allows
* just one page.
*/
else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
max_size = 0;
out_table:
dm_table_put(map);
out:
/*
* Always allow an entire first page
*/
if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
max_size = biovec->bv_len;
return max_size;
}
/*
* The request function that just remaps the bio built up by
* dm_merge_bvec.
*/
static void _dm_request(struct request_queue *q, struct bio *bio)
{
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
int cpu;
down_read(&md->io_lock);
cpu = part_stat_lock();
part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
part_stat_unlock();
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
up_read(&md->io_lock);
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
if (bio_rw(bio) != READA)
queue_io(md, bio);
else
bio_io_error(bio);
return;
}
__split_and_process_bio(md, bio);
up_read(&md->io_lock);
return;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
}
static int dm_request_based(struct mapped_device *md)
{
return blk_queue_stackable(md->queue);
}
static void dm_request(struct request_queue *q, struct bio *bio)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
{
struct mapped_device *md = q->queuedata;
if (dm_request_based(md))
blk_queue_bio(q, bio);
else
_dm_request(q, bio);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
}
void dm_dispatch_request(struct request *rq)
{
int r;
if (blk_queue_io_stat(rq->q))
rq->cmd_flags |= REQ_IO_STAT;
rq->start_time = jiffies;
r = blk_insert_cloned_request(rq->q, rq);
if (r)
dm_complete_request(rq, r);
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);
static void dm_rq_bio_destructor(struct bio *bio)
{
struct dm_rq_clone_bio_info *info = bio->bi_private;
struct mapped_device *md = info->tio->md;
free_bio_info(info);
bio_free(bio, md->bs);
}
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
void *data)
{
struct dm_rq_target_io *tio = data;
struct mapped_device *md = tio->md;
struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
if (!info)
return -ENOMEM;
info->orig = bio_orig;
info->tio = tio;
bio->bi_end_io = end_clone_bio;
bio->bi_private = info;
bio->bi_destructor = dm_rq_bio_destructor;
return 0;
}
static int setup_clone(struct request *clone, struct request *rq,
struct dm_rq_target_io *tio)
{
dm: add request based barrier support This patch adds barrier support for request-based dm. CORE DESIGN The design is basically same as bio-based dm, which emulates barrier by mapping empty barrier bios before/after a barrier I/O. But request-based dm has been using struct request_queue for I/O queueing, so the block-layer's barrier mechanism can be used. o Summary of the block-layer's behavior (which is depended by dm-core) Request-based dm uses QUEUE_ORDERED_DRAIN_FLUSH ordered mode for I/O barrier. It means that when an I/O requiring barrier is found in the request_queue, the block-layer makes pre-flush request and post-flush request just before and just after the I/O respectively. After the ordered sequence starts, the block-layer waits for all in-flight I/Os to complete, then gives drivers the pre-flush request, the barrier I/O and the post-flush request one by one. It means that the request_queue is stopped automatically by the block-layer until drivers complete each sequence. o dm-core For the barrier I/O, treats it as a normal I/O, so no additional code is needed. For the pre/post-flush request, flushes caches by the followings: 1. Make the number of empty barrier requests required by target's num_flush_requests, and map them (dm_rq_barrier()). 2. Waits for the mapped barriers to complete (dm_rq_barrier()). If error has occurred, save the error value to md->barrier_error (dm_end_request()). (*) Basically, the first reported error is taken. But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE follows. 3. Requeue the pre/post-flush request if the error value is DM_ENDIO_REQUEUE. Otherwise, completes with the error value (dm_rq_barrier_work()). The pre/post-flush work above is done in the kernel thread (kdmflush) context, since memory allocation which might sleep is needed in dm_rq_barrier() but sleep is not allowed in dm_request_fn(), which is an irq-disabled context. Also, clones of the pre/post-flush request share an original, so such clones can't be completed using the softirq context. Instead, complete them in the context of underlying device drivers. It should be safe since there is no I/O dispatching during the completion of such clones. For suspend, the workqueue of kdmflush needs to be flushed after the request_queue has been stopped. Otherwise, the next flush work can be kicked even after the suspend completes. TARGET INTERFACE No new interface is added. Just use the existing num_flush_requests in struct target_type as same as bio-based dm. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:18 +08:00
int r;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
dm_rq_bio_constructor, tio);
if (r)
return r;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
clone->cmd = rq->cmd;
clone->cmd_len = rq->cmd_len;
clone->sense = rq->sense;
clone->buffer = rq->buffer;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
clone->end_io = end_clone_request;
clone->end_io_data = tio;
return 0;
}
static struct request *clone_rq(struct request *rq, struct mapped_device *md,
gfp_t gfp_mask)
{
struct request *clone;
struct dm_rq_target_io *tio;
tio = alloc_rq_tio(md, gfp_mask);
if (!tio)
return NULL;
tio->md = md;
tio->ti = NULL;
tio->orig = rq;
tio->error = 0;
memset(&tio->info, 0, sizeof(tio->info));
clone = &tio->clone;
if (setup_clone(clone, rq, tio)) {
/* -ENOMEM */
free_rq_tio(tio);
return NULL;
}
return clone;
}
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* Called with the queue lock held.
*/
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
struct mapped_device *md = q->queuedata;
struct request *clone;
if (unlikely(rq->special)) {
DMWARN("Already has something in rq->special.");
return BLKPREP_KILL;
}
clone = clone_rq(rq, md, GFP_ATOMIC);
if (!clone)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
return BLKPREP_DEFER;
rq->special = clone;
rq->cmd_flags |= REQ_DONTPREP;
return BLKPREP_OK;
}
/*
* Returns:
* 0 : the request has been processed (not requeued)
* !0 : the request has been requeued
*/
static int map_request(struct dm_target *ti, struct request *clone,
struct mapped_device *md)
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
{
int r, requeued = 0;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
struct dm_rq_target_io *tio = clone->end_io_data;
/*
* Hold the md reference here for the in-flight I/O.
* We can't rely on the reference count by device opener,
* because the device may be closed during the request completion
* when all bios are completed.
* See the comment in rq_completed() too.
*/
dm_get(md);
tio->ti = ti;
r = ti->type->map_rq(ti, clone, &tio->info);
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
break;
case DM_MAPIO_REMAPPED:
/* The target has remapped the I/O so dispatch it */
dm: trace request based remapping This patch adds a remapping trace to request-based dm. BIO-based dm already has the equivalent tracepoint. For example, under this dm stack (linear LV on multipath): # dmsetup ls --tree -o ascii vg-lv0 (253:1) `-mpath0 (253:0) |- (8:160) |- (66:80) |- (65:176) `- (65:160) Trace of 'dd of=/dev/vg/lv0 bs=128k count=1 oflag=direct' looks like this: without the patch: dd-6674 [000] 539.727384: block_bio_queue: 253,1 WS 0 + 256 [dd] dd-6674 [000] 539.727392: block_remap: 253,0 WS 384 + 256 <- (253,1) 0 dd-6674 [000] 539.727394: block_bio_queue: 253,0 WS 384 + 256 [dd] dd-6674 [000] 539.727405: block_getrq: 253,0 WS 384 + 256 [dd] dd-6674 [000] 539.727409: block_plug: [dd] dd-6674 [000] 539.727410: block_rq_insert: 253,0 W 0 () 384 + 256 [dd] dd-6674 [000] 539.727416: block_rq_issue: 253,0 W 0 () 384 + 256 [dd] dd-6674 [000] 539.727426: block_rq_insert: 65,176 W 0 () 384 + 256 [dd] dd-6674 [000] 539.727427: block_rq_issue: 65,176 W 0 () 384 + 256 [dd] ... and with the patch: (the line with '**' is the trace added by this patch) dd-6617 [002] 162.914301: block_bio_queue: 253,1 WS 0 + 256 [dd] dd-6617 [002] 162.914314: block_remap: 253,0 WS 384 + 256 <- (253,1) 0 dd-6617 [002] 162.914316: block_bio_queue: 253,0 WS 384 + 256 [dd] dd-6617 [002] 162.914331: block_getrq: 253,0 WS 384 + 256 [dd] dd-6617 [002] 162.914335: block_plug: [dd] dd-6617 [002] 162.914337: block_rq_insert: 253,0 W 0 () 384 + 256 [dd] dd-6617 [002] 162.914347: block_rq_issue: 253,0 W 0 () 384 + 256 [dd] **dd-6617 [002] 162.914356: block_rq_remap: 65,176 W 384 + 256 <- (253,0) 384 dd-6617 [002] 162.914358: block_rq_insert: 65,176 W 0 () 384 + 256 [dd] dd-6617 [002] 162.914359: block_rq_issue: 65,176 W 0 () 384 + 256 [dd] ... Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Cc: Jens Axboe <jens.axboe@oracle.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:25 +08:00
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
blk_rq_pos(tio->orig));
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
dm_dispatch_request(clone);
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
dm_requeue_unmapped_request(clone);
requeued = 1;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
break;
default:
if (r > 0) {
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
/* The target wants to complete the I/O */
dm_kill_unmapped_request(clone, r);
break;
}
return requeued;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
}
/*
* q->request_fn for request-based dm.
* Called with the queue lock held.
*/
static void dm_request_fn(struct request_queue *q)
{
struct mapped_device *md = q->queuedata;
struct dm_table *map = dm_get_live_table(md);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
struct dm_target *ti;
struct request *rq, *clone;
sector_t pos;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* For suspend, check blk_queue_stopped() and increment
* ->pending within a single queue_lock not to increment the
* number of in-flight I/Os after the queue is stopped in
* dm_suspend().
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
*/
while (!blk_queue_stopped(q)) {
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
rq = blk_peek_request(q);
if (!rq)
goto delay_and_out;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/* always use block 0 to find the target for flushes for now */
pos = 0;
if (!(rq->cmd_flags & REQ_FLUSH))
pos = blk_rq_pos(rq);
ti = dm_table_find_target(map, pos);
BUG_ON(!dm_target_is_valid(ti));
dm: add request based barrier support This patch adds barrier support for request-based dm. CORE DESIGN The design is basically same as bio-based dm, which emulates barrier by mapping empty barrier bios before/after a barrier I/O. But request-based dm has been using struct request_queue for I/O queueing, so the block-layer's barrier mechanism can be used. o Summary of the block-layer's behavior (which is depended by dm-core) Request-based dm uses QUEUE_ORDERED_DRAIN_FLUSH ordered mode for I/O barrier. It means that when an I/O requiring barrier is found in the request_queue, the block-layer makes pre-flush request and post-flush request just before and just after the I/O respectively. After the ordered sequence starts, the block-layer waits for all in-flight I/Os to complete, then gives drivers the pre-flush request, the barrier I/O and the post-flush request one by one. It means that the request_queue is stopped automatically by the block-layer until drivers complete each sequence. o dm-core For the barrier I/O, treats it as a normal I/O, so no additional code is needed. For the pre/post-flush request, flushes caches by the followings: 1. Make the number of empty barrier requests required by target's num_flush_requests, and map them (dm_rq_barrier()). 2. Waits for the mapped barriers to complete (dm_rq_barrier()). If error has occurred, save the error value to md->barrier_error (dm_end_request()). (*) Basically, the first reported error is taken. But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE follows. 3. Requeue the pre/post-flush request if the error value is DM_ENDIO_REQUEUE. Otherwise, completes with the error value (dm_rq_barrier_work()). The pre/post-flush work above is done in the kernel thread (kdmflush) context, since memory allocation which might sleep is needed in dm_rq_barrier() but sleep is not allowed in dm_request_fn(), which is an irq-disabled context. Also, clones of the pre/post-flush request share an original, so such clones can't be completed using the softirq context. Instead, complete them in the context of underlying device drivers. It should be safe since there is no I/O dispatching during the completion of such clones. For suspend, the workqueue of kdmflush needs to be flushed after the request_queue has been stopped. Otherwise, the next flush work can be kicked even after the suspend completes. TARGET INTERFACE No new interface is added. Just use the existing num_flush_requests in struct target_type as same as bio-based dm. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:18 +08:00
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
if (ti->type->busy && ti->type->busy(ti))
goto delay_and_out;
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
blk_start_request(rq);
clone = rq->special;
atomic_inc(&md->pending[rq_data_dir(clone)]);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
spin_unlock(q->queue_lock);
if (map_request(ti, clone, md))
goto requeued;
BUG_ON(!irqs_disabled());
spin_lock(q->queue_lock);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
}
goto out;
requeued:
BUG_ON(!irqs_disabled());
spin_lock(q->queue_lock);
delay_and_out:
blk_delay_queue(q, HZ / 10);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
out:
dm_table_put(map);
return;
}
int dm_underlying_device_busy(struct request_queue *q)
{
return blk_lld_busy(q);
}
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
static int dm_lld_busy(struct request_queue *q)
{
int r;
struct mapped_device *md = q->queuedata;
struct dm_table *map = dm_get_live_table(md);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
r = 1;
else
r = dm_table_any_busy_target(map);
dm_table_put(map);
return r;
}
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r = bdi_bits;
struct mapped_device *md = congested_data;
struct dm_table *map;
if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
map = dm_get_live_table(md);
if (map) {
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* Request-based dm cares about only own queue for
* the query about congestion status of request_queue
*/
if (dm_request_based(md))
r = md->queue->backing_dev_info.state &
bdi_bits;
else
r = dm_table_any_congested(map, bdi_bits);
dm_table_put(map);
}
}
return r;
}
/*-----------------------------------------------------------------
* An IDR is used to keep track of allocated minor numbers.
*---------------------------------------------------------------*/
static void free_minor(int minor)
{
spin_lock(&_minor_lock);
idr_remove(&_minor_idr, minor);
spin_unlock(&_minor_lock);
}
/*
* See if the device with a specific minor # is free.
*/
static int specific_minor(int minor)
{
int r, m;
if (minor >= (1 << MINORBITS))
return -EINVAL;
r = idr_pre_get(&_minor_idr, GFP_KERNEL);
if (!r)
return -ENOMEM;
spin_lock(&_minor_lock);
if (idr_find(&_minor_idr, minor)) {
r = -EBUSY;
goto out;
}
r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
if (r)
goto out;
if (m != minor) {
idr_remove(&_minor_idr, m);
r = -EBUSY;
goto out;
}
out:
spin_unlock(&_minor_lock);
return r;
}
static int next_free_minor(int *minor)
{
int r, m;
r = idr_pre_get(&_minor_idr, GFP_KERNEL);
if (!r)
return -ENOMEM;
spin_lock(&_minor_lock);
r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
if (r)
goto out;
if (m >= (1 << MINORBITS)) {
idr_remove(&_minor_idr, m);
r = -ENOSPC;
goto out;
}
*minor = m;
out:
spin_unlock(&_minor_lock);
return r;
}
static const struct block_device_operations dm_blk_dops;
static void dm_wq_work(struct work_struct *work);
dm: do not initialise full request queue when bio based Change bio-based mapped devices no longer to have a fully initialized request_queue (request_fn, elevator, etc). This means bio-based DM devices no longer register elevator sysfs attributes ('iosched/' tree or 'scheduler' other than "none"). In contrast, a request-based DM device will continue to have a full request_queue and will register elevator sysfs attributes. Therefore a user can determine a DM device's type by checking if elevator sysfs attributes exist. First allocate a minimalist request_queue structure for a DM device (needed for both bio and request-based DM). Initialization of a full request_queue is deferred until it is known that the DM device is request-based, at the end of the table load sequence. Factor DM device's request_queue initialization: - common to both request-based and bio-based into dm_init_md_queue(). - specific to request-based into dm_init_request_based_queue(). The md->type_lock mutex is used to protect md->queue, in addition to md->type, during table_load(). A DM device's first table_load will establish the immutable md->type. But md->queue initialization, based on md->type, may fail at that time (because blk_init_allocated_queue cannot allocate memory). Therefore any subsequent table_load must (re)try dm_setup_md_queue independently of establishing md->type. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:14:02 +08:00
static void dm_init_md_queue(struct mapped_device *md)
{
/*
* Request-based dm devices cannot be stacked on top of bio-based dm
* devices. The type of this dm device has not been decided yet.
* The type is decided at the first table loading time.
* To prevent problematic device stacking, clear the queue flag
* for request stacking support until then.
*
* This queue is new, so no concurrency on the queue_flags.
*/
queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested;
md->queue->backing_dev_info.congested_data = md;
blk_queue_make_request(md->queue, dm_request);
blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
blk_queue_merge_bvec(md->queue, dm_merge_bvec);
}
/*
* Allocate and initialise a blank device with a given minor.
*/
static struct mapped_device *alloc_dev(int minor)
{
int r;
struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
void *old_md;
if (!md) {
DMWARN("unable to allocate device, out of memory.");
return NULL;
}
if (!try_module_get(THIS_MODULE))
goto bad_module_get;
/* get a minor number for the dev */
if (minor == DM_ANY_MINOR)
r = next_free_minor(&minor);
else
r = specific_minor(minor);
if (r < 0)
goto bad_minor;
md->type = DM_TYPE_NONE;
init_rwsem(&md->io_lock);
mutex_init(&md->suspend_lock);
mutex_init(&md->type_lock);
spin_lock_init(&md->deferred_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
atomic_set(&md->event_nr, 0);
atomic_set(&md->uevent_seq, 0);
INIT_LIST_HEAD(&md->uevent_list);
spin_lock_init(&md->uevent_lock);
dm: do not initialise full request queue when bio based Change bio-based mapped devices no longer to have a fully initialized request_queue (request_fn, elevator, etc). This means bio-based DM devices no longer register elevator sysfs attributes ('iosched/' tree or 'scheduler' other than "none"). In contrast, a request-based DM device will continue to have a full request_queue and will register elevator sysfs attributes. Therefore a user can determine a DM device's type by checking if elevator sysfs attributes exist. First allocate a minimalist request_queue structure for a DM device (needed for both bio and request-based DM). Initialization of a full request_queue is deferred until it is known that the DM device is request-based, at the end of the table load sequence. Factor DM device's request_queue initialization: - common to both request-based and bio-based into dm_init_md_queue(). - specific to request-based into dm_init_request_based_queue(). The md->type_lock mutex is used to protect md->queue, in addition to md->type, during table_load(). A DM device's first table_load will establish the immutable md->type. But md->queue initialization, based on md->type, may fail at that time (because blk_init_allocated_queue cannot allocate memory). Therefore any subsequent table_load must (re)try dm_setup_md_queue independently of establishing md->type. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:14:02 +08:00
md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue)
goto bad_queue;
dm: do not initialise full request queue when bio based Change bio-based mapped devices no longer to have a fully initialized request_queue (request_fn, elevator, etc). This means bio-based DM devices no longer register elevator sysfs attributes ('iosched/' tree or 'scheduler' other than "none"). In contrast, a request-based DM device will continue to have a full request_queue and will register elevator sysfs attributes. Therefore a user can determine a DM device's type by checking if elevator sysfs attributes exist. First allocate a minimalist request_queue structure for a DM device (needed for both bio and request-based DM). Initialization of a full request_queue is deferred until it is known that the DM device is request-based, at the end of the table load sequence. Factor DM device's request_queue initialization: - common to both request-based and bio-based into dm_init_md_queue(). - specific to request-based into dm_init_request_based_queue(). The md->type_lock mutex is used to protect md->queue, in addition to md->type, during table_load(). A DM device's first table_load will establish the immutable md->type. But md->queue initialization, based on md->type, may fail at that time (because blk_init_allocated_queue cannot allocate memory). Therefore any subsequent table_load must (re)try dm_setup_md_queue independently of establishing md->type. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:14:02 +08:00
dm_init_md_queue(md);
md->disk = alloc_disk(1);
if (!md->disk)
goto bad_disk;
atomic_set(&md->pending[0], 0);
atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
md->disk->major = _major;
md->disk->first_minor = minor;
md->disk->fops = &dm_blk_dops;
md->disk->queue = md->queue;
md->disk->private_data = md;
sprintf(md->disk->disk_name, "dm-%d", minor);
add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
md->wq = alloc_workqueue("kdmflush",
WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
if (!md->wq)
goto bad_thread;
md->bdev = bdget_disk(md->disk, 0);
if (!md->bdev)
goto bad_bdev;
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
bio_init(&md->flush_bio);
md->flush_bio.bi_bdev = md->bdev;
md->flush_bio.bi_rw = WRITE_FLUSH;
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor);
spin_unlock(&_minor_lock);
BUG_ON(old_md != MINOR_ALLOCED);
return md;
bad_bdev:
destroy_workqueue(md->wq);
bad_thread:
del_gendisk(md->disk);
put_disk(md->disk);
bad_disk:
blk_cleanup_queue(md->queue);
bad_queue:
free_minor(minor);
bad_minor:
module_put(THIS_MODULE);
bad_module_get:
kfree(md);
return NULL;
}
dm: fix thaw_bdev This patch fixes a bd_mount_sem counter corruption bug in device-mapper. thaw_bdev() should be called only when freeze_bdev() was called for the device. Otherwise, thaw_bdev() will up bd_mount_sem and corrupt the semaphore counter. struct block_device with the corrupted semaphore may remain in slab cache and be reused later. Attached patch will fix it by calling unlock_fs() instead. unlock_fs() will determine whether it should call thaw_bdev() by checking the device is frozen or not. Easy reproducer is: #!/bin/sh while [ 1 ]; do dmsetup --notable create a dmsetup --nolockfs suspend a dmsetup remove a done It's not easy to see the effect of corrupted semaphore. So I have tested with putting printk below in bdev_alloc_inode(): if (atomic_read(&ei->bdev.bd_mount_sem.count) != 1) printk(KERN_DEBUG "Incorrect semaphore count = %d (%p)\n", atomic_read(&ei->bdev.bd_mount_sem.count), &ei->bdev); Without the patch, I saw something like: Incorrect semaphore count = 17 (f2ab91c0) With the patch, the message didn't appear. The bug was introduced in 2.6.16 with this bug fix: commit d9dde59ba03095e526640988c0fedd75e93bc8b7 Date: Fri Feb 24 13:04:24 2006 -0800 [PATCH] dm: missing bdput/thaw_bdev at removal Need to unfreeze and release bdev otherwise the bdev inode with inconsistent state is reused later and cause problem. and backported to 2.6.15.5. It occurs only in free_dev(), which is called only when the dm device is removed. The buggy code is executed only if md->suspended_bdev is non-NULL and that can happen only when the device was suspended without noflush. Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: stable@kernel.org
2007-10-20 05:38:43 +08:00
static void unlock_fs(struct mapped_device *md);
static void free_dev(struct mapped_device *md)
{
int minor = MINOR(disk_devt(md->disk));
unlock_fs(md);
bdput(md->bdev);
destroy_workqueue(md->wq);
if (md->tio_pool)
mempool_destroy(md->tio_pool);
if (md->io_pool)
mempool_destroy(md->io_pool);
if (md->bs)
bioset_free(md->bs);
blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
free_minor(minor);
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
put_disk(md->disk);
blk_cleanup_queue(md->queue);
module_put(THIS_MODULE);
kfree(md);
}
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
struct dm_md_mempools *p;
if (md->io_pool && md->tio_pool && md->bs)
/* the md already has necessary mempools */
goto out;
p = dm_table_get_md_mempools(t);
BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
md->io_pool = p->io_pool;
p->io_pool = NULL;
md->tio_pool = p->tio_pool;
p->tio_pool = NULL;
md->bs = p->bs;
p->bs = NULL;
out:
/* mempool bind completed, now no need any mempools in the table */
dm_table_free_md_mempools(t);
}
/*
* Bind a table to the device.
*/
static void event_callback(void *context)
{
unsigned long flags;
LIST_HEAD(uevents);
struct mapped_device *md = (struct mapped_device *) context;
spin_lock_irqsave(&md->uevent_lock, flags);
list_splice_init(&md->uevent_list, &uevents);
spin_unlock_irqrestore(&md->uevent_lock, flags);
dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
atomic_inc(&md->event_nr);
wake_up(&md->eventq);
}
/*
* Protected by md->suspend_lock obtained by dm_swap_table().
*/
static void __set_size(struct mapped_device *md, sector_t size)
{
set_capacity(md->disk, size);
i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}
/*
* Return 1 if the queue has a compulsory merge_bvec_fn function.
*
* If this function returns 0, then the device is either a non-dm
* device without a merge_bvec_fn, or it is a dm device that is
* able to split any bios it receives that are too big.
*/
int dm_queue_merge_is_compulsory(struct request_queue *q)
{
struct mapped_device *dev_md;
if (!q->merge_bvec_fn)
return 0;
if (q->make_request_fn == dm_request) {
dev_md = q->queuedata;
if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
return 0;
}
return 1;
}
static int dm_device_merge_is_compulsory(struct dm_target *ti,
struct dm_dev *dev, sector_t start,
sector_t len, void *data)
{
struct block_device *bdev = dev->bdev;
struct request_queue *q = bdev_get_queue(bdev);
return dm_queue_merge_is_compulsory(q);
}
/*
* Return 1 if it is acceptable to ignore merge_bvec_fn based
* on the properties of the underlying devices.
*/
static int dm_table_merge_is_optional(struct dm_table *table)
{
unsigned i = 0;
struct dm_target *ti;
while (i < dm_table_get_num_targets(table)) {
ti = dm_table_get_target(table, i++);
if (ti->type->iterate_devices &&
ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
return 0;
}
return 1;
}
/*
* Returns old map, which caller must destroy.
*/
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct queue_limits *limits)
{
struct dm_table *old_map;
struct request_queue *q = md->queue;
sector_t size;
unsigned long flags;
int merge_is_optional;
size = dm_table_get_size(t);
/*
* Wipe any geometry if the size of the table changed.
*/
if (size != get_capacity(md->disk))
memset(&md->geometry, 0, sizeof(md->geometry));
__set_size(md, size);
dm table: rework reference counting Rework table reference counting. The existing code uses a reference counter. When the last reference is dropped and the counter reaches zero, the table destructor is called. Table reference counters are acquired/released from upcalls from other kernel code (dm_any_congested, dm_merge_bvec, dm_unplug_all). If the reference counter reaches zero in one of the upcalls, the table destructor is called from almost random kernel code. This leads to various problems: * dm_any_congested being called under a spinlock, which calls the destructor, which calls some sleeping function. * the destructor attempting to take a lock that is already taken by the same process. * stale reference from some other kernel code keeps the table constructed, which keeps some devices open, even after successful return from "dmsetup remove". This can confuse lvm and prevent closing of underlying devices or reusing device minor numbers. The patch changes reference counting so that the table destructor can be called only at predetermined places. The table has always exactly one reference from either mapped_device->map or hash_cell->new_map. After this patch, this reference is not counted in table->holders. A pair of dm_create_table/dm_destroy_table functions is used for table creation/destruction. Temporary references from the other code increase table->holders. A pair of dm_table_get/dm_table_put functions is used to manipulate it. When the table is about to be destroyed, we wait for table->holders to reach 0. Then, we call the table destructor. We use active waiting with msleep(1), because the situation happens rarely (to one user in 5 years) and removing the device isn't performance-critical task: the user doesn't care if it takes one tick more or not. This way, the destructor is called only at specific points (dm_table_destroy function) and the above problems associated with lazy destruction can't happen. Finally remove the temporary protection added to dm_any_congested(). Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-01-06 11:05:10 +08:00
dm_table_event_callback(t, event_callback, md);
/*
* The queue hasn't been stopped yet, if the old table type wasn't
* for request-based during suspension. So stop it to prevent
* I/O mapping before resume.
* This must be done before setting the queue restrictions,
* because request-based dm may be run just after the setting.
*/
if (dm_table_request_based(t) && !blk_queue_stopped(q))
stop_queue(q);
__bind_mempools(md, t);
merge_is_optional = dm_table_merge_is_optional(t);
write_lock_irqsave(&md->map_lock, flags);
old_map = md->map;
md->map = t;
md->immutable_target_type = dm_table_get_immutable_target_type(t);
dm_table_set_restrictions(t, q, limits);
if (merge_is_optional)
set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
else
clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
write_unlock_irqrestore(&md->map_lock, flags);
return old_map;
}
/*
* Returns unbound table for the caller to free.
*/
static struct dm_table *__unbind(struct mapped_device *md)
{
struct dm_table *map = md->map;
unsigned long flags;
if (!map)
return NULL;
dm_table_event_callback(map, NULL, NULL);
write_lock_irqsave(&md->map_lock, flags);
md->map = NULL;
write_unlock_irqrestore(&md->map_lock, flags);
return map;
}
/*
* Constructor for a new device.
*/
int dm_create(int minor, struct mapped_device **result)
{
struct mapped_device *md;
md = alloc_dev(minor);
if (!md)
return -ENXIO;
dm_sysfs_init(md);
*result = md;
return 0;
}
/*
* Functions to manage md->type.
* All are required to hold md->type_lock.
*/
void dm_lock_md_type(struct mapped_device *md)
{
mutex_lock(&md->type_lock);
}
void dm_unlock_md_type(struct mapped_device *md)
{
mutex_unlock(&md->type_lock);
}
void dm_set_md_type(struct mapped_device *md, unsigned type)
{
md->type = type;
}
unsigned dm_get_md_type(struct mapped_device *md)
{
return md->type;
}
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
return md->immutable_target_type;
}
dm: do not initialise full request queue when bio based Change bio-based mapped devices no longer to have a fully initialized request_queue (request_fn, elevator, etc). This means bio-based DM devices no longer register elevator sysfs attributes ('iosched/' tree or 'scheduler' other than "none"). In contrast, a request-based DM device will continue to have a full request_queue and will register elevator sysfs attributes. Therefore a user can determine a DM device's type by checking if elevator sysfs attributes exist. First allocate a minimalist request_queue structure for a DM device (needed for both bio and request-based DM). Initialization of a full request_queue is deferred until it is known that the DM device is request-based, at the end of the table load sequence. Factor DM device's request_queue initialization: - common to both request-based and bio-based into dm_init_md_queue(). - specific to request-based into dm_init_request_based_queue(). The md->type_lock mutex is used to protect md->queue, in addition to md->type, during table_load(). A DM device's first table_load will establish the immutable md->type. But md->queue initialization, based on md->type, may fail at that time (because blk_init_allocated_queue cannot allocate memory). Therefore any subsequent table_load must (re)try dm_setup_md_queue independently of establishing md->type. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:14:02 +08:00
/*
* Fully initialize a request-based queue (->elevator, ->request_fn, etc).
*/
static int dm_init_request_based_queue(struct mapped_device *md)
{
struct request_queue *q = NULL;
if (md->queue->elevator)
return 1;
/* Fully initialize the queue */
q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
if (!q)
return 0;
md->queue = q;
dm_init_md_queue(md);
blk_queue_softirq_done(md->queue, dm_softirq_done);
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
elv_register_queue(md->queue);
return 1;
}
/*
* Setup the DM device's queue based on md's type
*/
int dm_setup_md_queue(struct mapped_device *md)
{
if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
!dm_init_request_based_queue(md)) {
DMWARN("Cannot initialize queue for request-based mapped device");
return -EINVAL;
}
return 0;
}
static struct mapped_device *dm_find_md(dev_t dev)
{
struct mapped_device *md;
unsigned minor = MINOR(dev);
if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
return NULL;
spin_lock(&_minor_lock);
md = idr_find(&_minor_idr, minor);
if (md && (md == MINOR_ALLOCED ||
(MINOR(disk_devt(dm_disk(md))) != minor) ||
dm_deleting_md(md) ||
test_bit(DMF_FREEING, &md->flags))) {
md = NULL;
goto out;
}
out:
spin_unlock(&_minor_lock);
return md;
}
struct mapped_device *dm_get_md(dev_t dev)
{
struct mapped_device *md = dm_find_md(dev);
if (md)
dm_get(md);
return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);
void *dm_get_mdptr(struct mapped_device *md)
{
return md->interface_ptr;
}
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
md->interface_ptr = ptr;
}
void dm_get(struct mapped_device *md)
{
atomic_inc(&md->holders);
dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <<INTERRUPT>> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <<I/O completed>> dm_blk_close() dev_remove() dm_put(md) <<Free md>> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:13:56 +08:00
BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
const char *dm_device_name(struct mapped_device *md)
{
return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <<INTERRUPT>> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <<I/O completed>> dm_blk_close() dev_remove() dm_put(md) <<Free md>> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:13:56 +08:00
static void __dm_destroy(struct mapped_device *md, bool wait)
{
struct dm_table *map;
dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <<INTERRUPT>> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <<I/O completed>> dm_blk_close() dev_remove() dm_put(md) <<Free md>> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:13:56 +08:00
might_sleep();
dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <<INTERRUPT>> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <<I/O completed>> dm_blk_close() dev_remove() dm_put(md) <<Free md>> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:13:56 +08:00
spin_lock(&_minor_lock);
map = dm_get_live_table(md);
idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <<INTERRUPT>> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <<I/O completed>> dm_blk_close() dev_remove() dm_put(md) <<Free md>> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 11:13:56 +08:00
/*
* Rare, but there may be I/O requests still going to complete,
* for example. Wait for all references to disappear.
* No one should increment the reference count of the mapped_device,
* after the mapped_device state becomes DMF_FREEING.
*/
if (wait)
while (atomic_read(&md->holders))
msleep(1);
else if (atomic_read(&md->holders))
DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
dm_device_name(md), atomic_read(&md->holders));
dm_sysfs_exit(md);
dm_table_put(map);
dm_table_destroy(__unbind(md));
free_dev(md);
}
void dm_destroy(struct mapped_device *md)
{
__dm_destroy(md, true);
}
void dm_destroy_immediate(struct mapped_device *md)
{
__dm_destroy(md, false);
}
void dm_put(struct mapped_device *md)
{
atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
int r = 0;
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&md->wait, &wait);
while (1) {
set_current_state(interruptible);
if (!md_in_flight(md))
break;
if (interruptible == TASK_INTERRUPTIBLE &&
signal_pending(current)) {
r = -EINTR;
break;
}
io_schedule();
}
set_current_state(TASK_RUNNING);
remove_wait_queue(&md->wait, &wait);
return r;
}
/*
* Process the deferred bios
*/
static void dm_wq_work(struct work_struct *work)
{
struct mapped_device *md = container_of(work, struct mapped_device,
work);
struct bio *c;
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
down_read(&md->io_lock);
while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
spin_lock_irq(&md->deferred_lock);
c = bio_list_pop(&md->deferred);
spin_unlock_irq(&md->deferred_lock);
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
if (!c)
break;
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
up_read(&md->io_lock);
if (dm_request_based(md))
generic_make_request(c);
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
else
__split_and_process_bio(md, c);
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
down_read(&md->io_lock);
}
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
up_read(&md->io_lock);
}
static void dm_queue_flush(struct mapped_device *md)
{
clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
smp_mb__after_clear_bit();
queue_work(md->wq, &md->work);
}
/*
* Swap in a new table, returning the old one for the caller to destroy.
*/
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
struct dm_table *map = ERR_PTR(-EINVAL);
struct queue_limits limits;
int r;
mutex_lock(&md->suspend_lock);
/* device must be suspended */
if (!dm_suspended_md(md))
goto out;
r = dm_calculate_queue_limits(table, &limits);
if (r) {
map = ERR_PTR(r);
goto out;
}
map = __bind(md, table, &limits);
out:
mutex_unlock(&md->suspend_lock);
return map;
}
/*
* Functions to lock and unlock any filesystem running on the
* device.
*/
static int lock_fs(struct mapped_device *md)
{
int r;
WARN_ON(md->frozen_sb);
md->frozen_sb = freeze_bdev(md->bdev);
if (IS_ERR(md->frozen_sb)) {
r = PTR_ERR(md->frozen_sb);
md->frozen_sb = NULL;
return r;
}
set_bit(DMF_FROZEN, &md->flags);
return 0;
}
static void unlock_fs(struct mapped_device *md)
{
if (!test_bit(DMF_FROZEN, &md->flags))
return;
thaw_bdev(md->bdev, md->frozen_sb);
md->frozen_sb = NULL;
clear_bit(DMF_FROZEN, &md->flags);
}
/*
* We need to be able to change a mapping table under a mounted
* filesystem. For example we might want to move some data in
* the background. Before the table can be swapped with
* dm_bind_table, dm_suspend must be called to flush any in
* flight bios and ensure that any further io gets deferred.
*/
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* Suspend mechanism in request-based dm.
*
dm: simplify request based suspend The semantics of bio-based dm were changed recently in the case of suspend with "--nolockfs" but without "--noflush". Before 2.6.30, I/Os submitted before the suspend invocation were always flushed. From 2.6.30 onwards, I/Os submitted before the suspend invocation might not be flushed. (For details, see http://marc.info/?t=123994433400003&r=1&w=2) This patch brings the behaviour of request-based dm into line with bio-based dm, simplifying the code and preparing for a subsequent patch that will wait for all in_flight I/Os to complete without stopping request_queue and use dm_wait_for_completion() for it. This change in semantics simplifies the suspend code as follows: o Suspend is implemented as stopping request_queue in request-based dm, and all I/Os are queued in the request_queue even after suspend is invoked. o In the old semantics, we had to track whether I/Os were queued before or after the suspend invocation, so a special barrier-like request called 'suspend marker' was introduced. o With the new semantics, we don't need to flush any I/O so we can remove the marker and the code related to the marker handling and I/O flushing. After removing this codes, the suspend sequence is now: 1. Flush all I/Os by lock_fs() if needed. 2. Stop dispatching any I/O by stopping the request_queue. 3. Wait for all in-flight I/Os to be completed or requeued. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:16 +08:00
* 1. Flush all I/Os by lock_fs() if needed.
* 2. Stop dispatching any I/O by stopping the request_queue.
* 3. Wait for all in-flight I/Os to be completed or requeued.
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
*
dm: simplify request based suspend The semantics of bio-based dm were changed recently in the case of suspend with "--nolockfs" but without "--noflush". Before 2.6.30, I/Os submitted before the suspend invocation were always flushed. From 2.6.30 onwards, I/Os submitted before the suspend invocation might not be flushed. (For details, see http://marc.info/?t=123994433400003&r=1&w=2) This patch brings the behaviour of request-based dm into line with bio-based dm, simplifying the code and preparing for a subsequent patch that will wait for all in_flight I/Os to complete without stopping request_queue and use dm_wait_for_completion() for it. This change in semantics simplifies the suspend code as follows: o Suspend is implemented as stopping request_queue in request-based dm, and all I/Os are queued in the request_queue even after suspend is invoked. o In the old semantics, we had to track whether I/Os were queued before or after the suspend invocation, so a special barrier-like request called 'suspend marker' was introduced. o With the new semantics, we don't need to flush any I/O so we can remove the marker and the code related to the marker handling and I/O flushing. After removing this codes, the suspend sequence is now: 1. Flush all I/Os by lock_fs() if needed. 2. Stop dispatching any I/O by stopping the request_queue. 3. Wait for all in-flight I/Os to be completed or requeued. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:16 +08:00
* To abort suspend, start the request_queue.
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
*/
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
struct dm_table *map = NULL;
int r = 0;
int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
mutex_lock(&md->suspend_lock);
if (dm_suspended_md(md)) {
r = -EINVAL;
goto out_unlock;
}
map = dm_get_live_table(md);
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
/*
* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
* This flag is cleared before dm_suspend returns.
*/
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
/* This does not get reverted if there's an error later. */
dm_table_presuspend_targets(map);
/*
dm: simplify request based suspend The semantics of bio-based dm were changed recently in the case of suspend with "--nolockfs" but without "--noflush". Before 2.6.30, I/Os submitted before the suspend invocation were always flushed. From 2.6.30 onwards, I/Os submitted before the suspend invocation might not be flushed. (For details, see http://marc.info/?t=123994433400003&r=1&w=2) This patch brings the behaviour of request-based dm into line with bio-based dm, simplifying the code and preparing for a subsequent patch that will wait for all in_flight I/Os to complete without stopping request_queue and use dm_wait_for_completion() for it. This change in semantics simplifies the suspend code as follows: o Suspend is implemented as stopping request_queue in request-based dm, and all I/Os are queued in the request_queue even after suspend is invoked. o In the old semantics, we had to track whether I/Os were queued before or after the suspend invocation, so a special barrier-like request called 'suspend marker' was introduced. o With the new semantics, we don't need to flush any I/O so we can remove the marker and the code related to the marker handling and I/O flushing. After removing this codes, the suspend sequence is now: 1. Flush all I/Os by lock_fs() if needed. 2. Stop dispatching any I/O by stopping the request_queue. 3. Wait for all in-flight I/Os to be completed or requeued. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:16 +08:00
* Flush I/O to the device.
* Any I/O submitted after lock_fs() may not be flushed.
* noflush takes precedence over do_lockfs.
* (lock_fs() flushes I/Os and waits for them to complete.)
*/
if (!noflush && do_lockfs) {
r = lock_fs(md);
if (r)
goto out;
}
/*
* Here we must make sure that no processes are submitting requests
* to target drivers i.e. no one may be executing
* __split_and_process_bio. This is called from dm_request and
* dm_wq_work.
*
* To get all processes out of __split_and_process_bio in dm_request,
* we take the write lock. To prevent any process from reentering
dm: relax ordering of bio-based flush implementation Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA doesn't mandate any ordering against other bio's. This patch relaxes ordering around flushes. * A flush bio is no longer deferred to workqueue directly. It's processed like other bio's but __split_and_process_bio() uses md->flush_bio as the clone source. md->flush_bio is initialized to empty flush during md initialization and shared for all flushes. * As a flush bio now travels through the same execution path as other bio's, there's no need for dedicated error handling path either. It can use the same error handling path in dec_pending(). Dedicated error handling removed along with md->flush_error. * When dec_pending() detects that a flush has completed, it checks whether the original bio has data. If so, the bio is queued to the deferred list w/ REQ_FLUSH cleared; otherwise, it's completed. * As flush sequencing is handled in the usual issue/completion path, dm_wq_work() no longer needs to handle flushes differently. Now its only responsibility is re-issuing deferred bio's the same way as _dm_request() would. REQ_FLUSH handling logic including process_flush() is dropped. * There's no reason for queue_io() and dm_wq_work() write lock dm->io_lock. queue_io() now only uses md->deferred_lock and dm_wq_work() read locks dm->io_lock. * bio's no longer need to be queued on the deferred list while a flush is in progress making DMF_QUEUE_IO_TO_THREAD unncessary. Drop it. This avoids stalling the device during flushes and simplifies the implementation. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-09-09 00:07:00 +08:00
* __split_and_process_bio from dm_request and quiesce the thread
* (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
* flush_workqueue(md->wq).
*/
down_write(&md->io_lock);
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
up_write(&md->io_lock);
dm: add request based barrier support This patch adds barrier support for request-based dm. CORE DESIGN The design is basically same as bio-based dm, which emulates barrier by mapping empty barrier bios before/after a barrier I/O. But request-based dm has been using struct request_queue for I/O queueing, so the block-layer's barrier mechanism can be used. o Summary of the block-layer's behavior (which is depended by dm-core) Request-based dm uses QUEUE_ORDERED_DRAIN_FLUSH ordered mode for I/O barrier. It means that when an I/O requiring barrier is found in the request_queue, the block-layer makes pre-flush request and post-flush request just before and just after the I/O respectively. After the ordered sequence starts, the block-layer waits for all in-flight I/Os to complete, then gives drivers the pre-flush request, the barrier I/O and the post-flush request one by one. It means that the request_queue is stopped automatically by the block-layer until drivers complete each sequence. o dm-core For the barrier I/O, treats it as a normal I/O, so no additional code is needed. For the pre/post-flush request, flushes caches by the followings: 1. Make the number of empty barrier requests required by target's num_flush_requests, and map them (dm_rq_barrier()). 2. Waits for the mapped barriers to complete (dm_rq_barrier()). If error has occurred, save the error value to md->barrier_error (dm_end_request()). (*) Basically, the first reported error is taken. But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE follows. 3. Requeue the pre/post-flush request if the error value is DM_ENDIO_REQUEUE. Otherwise, completes with the error value (dm_rq_barrier_work()). The pre/post-flush work above is done in the kernel thread (kdmflush) context, since memory allocation which might sleep is needed in dm_rq_barrier() but sleep is not allowed in dm_request_fn(), which is an irq-disabled context. Also, clones of the pre/post-flush request share an original, so such clones can't be completed using the softirq context. Instead, complete them in the context of underlying device drivers. It should be safe since there is no I/O dispatching during the completion of such clones. For suspend, the workqueue of kdmflush needs to be flushed after the request_queue has been stopped. Otherwise, the next flush work can be kicked even after the suspend completes. TARGET INTERFACE No new interface is added. Just use the existing num_flush_requests in struct target_type as same as bio-based dm. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:18 +08:00
/*
* Stop md->queue before flushing md->wq in case request-based
* dm defers requests to md->wq from md->queue.
dm: add request based barrier support This patch adds barrier support for request-based dm. CORE DESIGN The design is basically same as bio-based dm, which emulates barrier by mapping empty barrier bios before/after a barrier I/O. But request-based dm has been using struct request_queue for I/O queueing, so the block-layer's barrier mechanism can be used. o Summary of the block-layer's behavior (which is depended by dm-core) Request-based dm uses QUEUE_ORDERED_DRAIN_FLUSH ordered mode for I/O barrier. It means that when an I/O requiring barrier is found in the request_queue, the block-layer makes pre-flush request and post-flush request just before and just after the I/O respectively. After the ordered sequence starts, the block-layer waits for all in-flight I/Os to complete, then gives drivers the pre-flush request, the barrier I/O and the post-flush request one by one. It means that the request_queue is stopped automatically by the block-layer until drivers complete each sequence. o dm-core For the barrier I/O, treats it as a normal I/O, so no additional code is needed. For the pre/post-flush request, flushes caches by the followings: 1. Make the number of empty barrier requests required by target's num_flush_requests, and map them (dm_rq_barrier()). 2. Waits for the mapped barriers to complete (dm_rq_barrier()). If error has occurred, save the error value to md->barrier_error (dm_end_request()). (*) Basically, the first reported error is taken. But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE follows. 3. Requeue the pre/post-flush request if the error value is DM_ENDIO_REQUEUE. Otherwise, completes with the error value (dm_rq_barrier_work()). The pre/post-flush work above is done in the kernel thread (kdmflush) context, since memory allocation which might sleep is needed in dm_rq_barrier() but sleep is not allowed in dm_request_fn(), which is an irq-disabled context. Also, clones of the pre/post-flush request share an original, so such clones can't be completed using the softirq context. Instead, complete them in the context of underlying device drivers. It should be safe since there is no I/O dispatching during the completion of such clones. For suspend, the workqueue of kdmflush needs to be flushed after the request_queue has been stopped. Otherwise, the next flush work can be kicked even after the suspend completes. TARGET INTERFACE No new interface is added. Just use the existing num_flush_requests in struct target_type as same as bio-based dm. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:18 +08:00
*/
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
if (dm_request_based(md))
dm: simplify request based suspend The semantics of bio-based dm were changed recently in the case of suspend with "--nolockfs" but without "--noflush". Before 2.6.30, I/Os submitted before the suspend invocation were always flushed. From 2.6.30 onwards, I/Os submitted before the suspend invocation might not be flushed. (For details, see http://marc.info/?t=123994433400003&r=1&w=2) This patch brings the behaviour of request-based dm into line with bio-based dm, simplifying the code and preparing for a subsequent patch that will wait for all in_flight I/Os to complete without stopping request_queue and use dm_wait_for_completion() for it. This change in semantics simplifies the suspend code as follows: o Suspend is implemented as stopping request_queue in request-based dm, and all I/Os are queued in the request_queue even after suspend is invoked. o In the old semantics, we had to track whether I/Os were queued before or after the suspend invocation, so a special barrier-like request called 'suspend marker' was introduced. o With the new semantics, we don't need to flush any I/O so we can remove the marker and the code related to the marker handling and I/O flushing. After removing this codes, the suspend sequence is now: 1. Flush all I/Os by lock_fs() if needed. 2. Stop dispatching any I/O by stopping the request_queue. 3. Wait for all in-flight I/Os to be completed or requeued. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:16 +08:00
stop_queue(md->queue);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
dm: add request based barrier support This patch adds barrier support for request-based dm. CORE DESIGN The design is basically same as bio-based dm, which emulates barrier by mapping empty barrier bios before/after a barrier I/O. But request-based dm has been using struct request_queue for I/O queueing, so the block-layer's barrier mechanism can be used. o Summary of the block-layer's behavior (which is depended by dm-core) Request-based dm uses QUEUE_ORDERED_DRAIN_FLUSH ordered mode for I/O barrier. It means that when an I/O requiring barrier is found in the request_queue, the block-layer makes pre-flush request and post-flush request just before and just after the I/O respectively. After the ordered sequence starts, the block-layer waits for all in-flight I/Os to complete, then gives drivers the pre-flush request, the barrier I/O and the post-flush request one by one. It means that the request_queue is stopped automatically by the block-layer until drivers complete each sequence. o dm-core For the barrier I/O, treats it as a normal I/O, so no additional code is needed. For the pre/post-flush request, flushes caches by the followings: 1. Make the number of empty barrier requests required by target's num_flush_requests, and map them (dm_rq_barrier()). 2. Waits for the mapped barriers to complete (dm_rq_barrier()). If error has occurred, save the error value to md->barrier_error (dm_end_request()). (*) Basically, the first reported error is taken. But -EOPNOTSUPP supersedes any error and DM_ENDIO_REQUEUE follows. 3. Requeue the pre/post-flush request if the error value is DM_ENDIO_REQUEUE. Otherwise, completes with the error value (dm_rq_barrier_work()). The pre/post-flush work above is done in the kernel thread (kdmflush) context, since memory allocation which might sleep is needed in dm_rq_barrier() but sleep is not allowed in dm_request_fn(), which is an irq-disabled context. Also, clones of the pre/post-flush request share an original, so such clones can't be completed using the softirq context. Instead, complete them in the context of underlying device drivers. It should be safe since there is no I/O dispatching during the completion of such clones. For suspend, the workqueue of kdmflush needs to be flushed after the request_queue has been stopped. Otherwise, the next flush work can be kicked even after the suspend completes. TARGET INTERFACE No new interface is added. Just use the existing num_flush_requests in struct target_type as same as bio-based dm. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:18 +08:00
flush_workqueue(md->wq);
/*
* At this point no more requests are entering target request routines.
* We call dm_wait_for_completion to wait for all existing requests
* to finish.
*/
r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
down_write(&md->io_lock);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
up_write(&md->io_lock);
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
/* were we interrupted ? */
if (r < 0) {
dm_queue_flush(md);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
if (dm_request_based(md))
dm: simplify request based suspend The semantics of bio-based dm were changed recently in the case of suspend with "--nolockfs" but without "--noflush". Before 2.6.30, I/Os submitted before the suspend invocation were always flushed. From 2.6.30 onwards, I/Os submitted before the suspend invocation might not be flushed. (For details, see http://marc.info/?t=123994433400003&r=1&w=2) This patch brings the behaviour of request-based dm into line with bio-based dm, simplifying the code and preparing for a subsequent patch that will wait for all in_flight I/Os to complete without stopping request_queue and use dm_wait_for_completion() for it. This change in semantics simplifies the suspend code as follows: o Suspend is implemented as stopping request_queue in request-based dm, and all I/Os are queued in the request_queue even after suspend is invoked. o In the old semantics, we had to track whether I/Os were queued before or after the suspend invocation, so a special barrier-like request called 'suspend marker' was introduced. o With the new semantics, we don't need to flush any I/O so we can remove the marker and the code related to the marker handling and I/O flushing. After removing this codes, the suspend sequence is now: 1. Flush all I/Os by lock_fs() if needed. 2. Stop dispatching any I/O by stopping the request_queue. 3. Wait for all in-flight I/Os to be completed or requeued. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-12-11 07:52:16 +08:00
start_queue(md->queue);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
unlock_fs(md);
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
goto out; /* pushback list is already flushed, so skip flush */
}
/*
* If dm_wait_for_completion returned 0, the device is completely
* quiescent now. There is no request-processing activity. All new
* requests are being added to md->deferred list.
*/
set_bit(DMF_SUSPENDED, &md->flags);
dm_table_postsuspend_targets(map);
out:
dm_table_put(map);
out_unlock:
mutex_unlock(&md->suspend_lock);
return r;
}
int dm_resume(struct mapped_device *md)
{
int r = -EINVAL;
struct dm_table *map = NULL;
mutex_lock(&md->suspend_lock);
if (!dm_suspended_md(md))
goto out;
map = dm_get_live_table(md);
if (!map || !dm_table_get_size(map))
goto out;
r = dm_table_resume_targets(map);
if (r)
goto out;
dm_queue_flush(md);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 17:12:35 +08:00
/*
* Flushing deferred I/Os must be done after targets are resumed
* so that mapping of targets can work correctly.
* Request-based dm is queueing the deferred I/Os in its request_queue.
*/
if (dm_request_based(md))
start_queue(md->queue);
unlock_fs(md);
clear_bit(DMF_SUSPENDED, &md->flags);
r = 0;
out:
dm_table_put(map);
mutex_unlock(&md->suspend_lock);
return r;
}
/*-----------------------------------------------------------------
* Event notification.
*---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
unsigned cookie)
{
char udev_cookie[DM_COOKIE_LENGTH];
char *envp[] = { udev_cookie, NULL };
if (!cookie)
return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
else {
snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
DM_COOKIE_ENV_VAR_NAME, cookie);
return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
action, envp);
}
}
uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
return atomic_add_return(1, &md->uevent_seq);
}
uint32_t dm_get_event_nr(struct mapped_device *md)
{
return atomic_read(&md->event_nr);
}
int dm_wait_event(struct mapped_device *md, int event_nr)
{
return wait_event_interruptible(md->eventq,
(event_nr != atomic_read(&md->event_nr)));
}
void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
unsigned long flags;
spin_lock_irqsave(&md->uevent_lock, flags);
list_add(elist, &md->uevent_list);
spin_unlock_irqrestore(&md->uevent_lock, flags);
}
/*
* The gendisk is only valid as long as you have a reference
* count on 'md'.
*/
struct gendisk *dm_disk(struct mapped_device *md)
{
return md->disk;
}
struct kobject *dm_kobject(struct mapped_device *md)
{
return &md->kobj;
}
/*
* struct mapped_device should not be exported outside of dm.c
* so use this check to verify that kobj is part of md structure
*/
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
struct mapped_device *md;
md = container_of(kobj, struct mapped_device, kobj);
if (&md->kobj != kobj)
return NULL;
if (test_bit(DMF_FREEING, &md->flags) ||
dm_deleting_md(md))
return NULL;
dm_get(md);
return md;
}
int dm_suspended_md(struct mapped_device *md)
{
return test_bit(DMF_SUSPENDED, &md->flags);
}
int dm_suspended(struct dm_target *ti)
{
dm table: remove dm_get from dm_table_get_md Remove the dm_get() in dm_table_get_md() because dm_table_get_md() could be called from presuspend/postsuspend, which are called while mapped_device is in DMF_FREEING state, where dm_get() is not allowed. Justification for that is the lifetime of both objects: As far as the current dm design/implementation, mapped_device is never freed while targets are doing something, because dm core waits for targets to become quiet in dm_put() using presuspend/postsuspend. So targets should be able to touch mapped_device without holding reference count of the mapped_device, and we should allow targets to touch mapped_device even if it is in DMF_FREEING state. Backgrounds: I'm trying to remove the multipath internal queue, since dm core now has a generic queue for request-based dm. In the patch-set, the multipath target wants to request dm core to start/stop queue. One of such start/stop requests can happen during postsuspend() while the target waits for pg-init to complete, because the target stops queue when starting pg-init and tries to restart it when completing pg-init. Since queue belongs to mapped_device, it involves calling dm_table_get_md() and dm_put(). On the other hand, postsuspend() is called in dm_put() for mapped_device which is in DMF_FREEING state, and that triggers BUG_ON(DMF_FREEING) in the 2nd dm_put(). I had tried to solve this problem by changing only multipath not to touch mapped_device which is in DMF_FREEING state, but I couldn't and I came up with a question why we need dm_get() in dm_table_get_md(). Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-03-06 10:29:52 +08:00
return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
int dm_noflush_suspending(struct dm_target *ti)
{
dm table: remove dm_get from dm_table_get_md Remove the dm_get() in dm_table_get_md() because dm_table_get_md() could be called from presuspend/postsuspend, which are called while mapped_device is in DMF_FREEING state, where dm_get() is not allowed. Justification for that is the lifetime of both objects: As far as the current dm design/implementation, mapped_device is never freed while targets are doing something, because dm core waits for targets to become quiet in dm_put() using presuspend/postsuspend. So targets should be able to touch mapped_device without holding reference count of the mapped_device, and we should allow targets to touch mapped_device even if it is in DMF_FREEING state. Backgrounds: I'm trying to remove the multipath internal queue, since dm core now has a generic queue for request-based dm. In the patch-set, the multipath target wants to request dm core to start/stop queue. One of such start/stop requests can happen during postsuspend() while the target waits for pg-init to complete, because the target stops queue when starting pg-init and tries to restart it when completing pg-init. Since queue belongs to mapped_device, it involves calling dm_table_get_md() and dm_put(). On the other hand, postsuspend() is called in dm_put() for mapped_device which is in DMF_FREEING state, and that triggers BUG_ON(DMF_FREEING) in the 2nd dm_put(). I had tried to solve this problem by changing only multipath not to touch mapped_device which is in DMF_FREEING state, but I couldn't and I came up with a question why we need dm_get() in dm_table_get_md(). Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-03-06 10:29:52 +08:00
return __noflush_suspending(dm_table_get_md(ti->table));
[PATCH] dm: suspend: add noflush pushback In device-mapper I/O is sometimes queued within targets for later processing. For example the multipath target can be configured to store I/O when no paths are available instead of returning it -EIO. This patch allows the device-mapper core to instruct a target to transfer the contents of any such in-target queue back into the core. This frees up the resources used by the target so the core can replace that target with an alternative one and then resend the I/O to it. Without this patch the only way to change the target in such circumstances involves returning the I/O with an error back to the filesystem/application. In the multipath case, this patch will let us add new paths for existing I/O to try after all the existing paths have failed. DMF_NOFLUSH_SUSPENDING ---------------------- If the DM_NOFLUSH_FLAG ioctl option is specified at suspend time, the DMF_NOFLUSH_SUSPENDING flag is set in md->flags during dm_suspend(). It is always cleared before dm_suspend() returns. The flag must be visible while the target is flushing pending I/Os so it is set before presuspend where the flush starts and unset after the wait for md->pending where the flush ends. Target drivers can check this flag by calling dm_noflush_suspending(). DM_MAPIO_REQUEUE / DM_ENDIO_REQUEUE ----------------------------------- A target's map() function can now return DM_MAPIO_REQUEUE to request the device mapper core queue the bio. Similarly, a target's end_io() function can return DM_ENDIO_REQUEUE to request the same. This has been labelled 'pushback'. The __map_bio() and clone_endio() functions in the core treat these return values as errors and call dec_pending() to end the I/O. dec_pending ----------- dec_pending() saves the pushback request in struct dm_io->error. Once all the split clones have ended, dec_pending() will put the original bio on the md->pushback list. Note that this supercedes any I/O errors. It is possible for the suspend with DM_NOFLUSH_FLAG to be aborted while in progress (e.g. by user interrupt). dec_pending() checks for this and returns -EIO if it happened. pushdback list and pushback_lock -------------------------------- The bio is queued on md->pushback temporarily in dec_pending(), and after all pending I/Os return, md->pushback is merged into md->deferred in dm_suspend() for re-issuing at resume time. md->pushback_lock protects md->pushback. The lock should be held with irq disabled because dec_pending() can be called from interrupt context. Queueing bios to md->pushback in dec_pending() must be done atomically with the check for DMF_NOFLUSH_SUSPENDING flag. So md->pushback_lock is held when checking the flag. Otherwise dec_pending() may queue a bio to md->pushback after the interrupted dm_suspend() flushes md->pushback. Then the bio would be left in md->pushback. Flag setting in dm_suspend() can be done without md->pushback_lock because the flag is checked only after presuspend and the set value is already made visible via the target's presuspend function. The flag can be checked without md->pushback_lock (e.g. the first part of the dec_pending() or target drivers), because the flag is checked again with md->pushback_lock held when the bio is really queued to md->pushback as described above. So even if the flag is cleared after the lockless checkings, the bio isn't left in md->pushback but returned to applications with -EIO. Other notes on the current patch -------------------------------- - md->pushback is added to the struct mapped_device instead of using md->deferred directly because md->io_lock which protects md->deferred is rw_semaphore and can't be used in interrupt context like dec_pending(), and md->io_lock protects the DMF_BLOCK_IO flag of md->flags too. - Don't issue lock_fs() in dm_suspend() if the DM_NOFLUSH_FLAG ioctl option is specified, because I/Os generated by lock_fs() would be pushed back and never return if there were no valid devices. - If an error occurs in dm_suspend() after the DMF_NOFLUSH_SUSPENDING flag is set, md->pushback must be flushed because I/Os may be queued to the list already. (flush_and_out label in dm_suspend()) Test results ------------ I have tested using multipath target with the next patch. The following tests are for regression/compatibility: - I/Os succeed when valid paths exist; - I/Os fail when there are no valid paths and queue_if_no_path is not set; - I/Os are queued in the multipath target when there are no valid paths and queue_if_no_path is set; - The queued I/Os above fail when suspend is issued without the DM_NOFLUSH_FLAG ioctl option. I/Os spanning 2 multipath targets also fail. The following tests are for the normal code path of new pushback feature: - Queued I/Os in the multipath target are flushed from the target but don't return when suspend is issued with the DM_NOFLUSH_FLAG ioctl option; - The I/Os above are queued in the multipath target again when resume is issued without path recovery; - The I/Os above succeed when resume is issued after path recovery or table load; - Queued I/Os in the multipath target succeed when resume is issued with the DM_NOFLUSH_FLAG ioctl option after table load. I/Os spanning 2 multipath targets also succeed. The following tests are for the error paths of the new pushback feature: - When the bdget_disk() fails in dm_suspend(), the DMF_NOFLUSH_SUSPENDING flag is cleared and I/Os already queued to the pushback list are flushed properly. - When suspend with the DM_NOFLUSH_FLAG ioctl option is interrupted, o I/Os which had already been queued to the pushback list at the time don't return, and are re-issued at resume time; o I/Os which hadn't been returned at the time return with EIO. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com> Cc: dm-devel@redhat.com Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-08 18:41:09 +08:00
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
{
struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
if (!pools)
return NULL;
pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
mempool_create_slab_pool(MIN_IOS, _io_cache) :
mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
if (!pools->io_pool)
goto free_pools_and_out;
pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
mempool_create_slab_pool(MIN_IOS, _tio_cache) :
mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
if (!pools->tio_pool)
goto free_io_pool_and_out;
pools->bs = bioset_create(pool_size, 0);
if (!pools->bs)
goto free_tio_pool_and_out;
if (integrity && bioset_integrity_create(pools->bs, pool_size))
goto free_bioset_and_out;
return pools;
free_bioset_and_out:
bioset_free(pools->bs);
free_tio_pool_and_out:
mempool_destroy(pools->tio_pool);
free_io_pool_and_out:
mempool_destroy(pools->io_pool);
free_pools_and_out:
kfree(pools);
return NULL;
}
void dm_free_md_mempools(struct dm_md_mempools *pools)
{
if (!pools)
return;
if (pools->io_pool)
mempool_destroy(pools->io_pool);
if (pools->tio_pool)
mempool_destroy(pools->tio_pool);
if (pools->bs)
bioset_free(pools->bs);
kfree(pools);
}
static const struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
.release = dm_blk_close,
.ioctl = dm_blk_ioctl,
.getgeo = dm_blk_getgeo,
.owner = THIS_MODULE
};
EXPORT_SYMBOL(dm_get_mapinfo);
/*
* module hooks
*/
module_init(dm_init);
module_exit(dm_exit);
module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");