[PATCH] md: add write-intent-bitmap support to raid5
The most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. If the raid thread notices that this has changed, it flushes bitmap changes, and assigns the value of seq_flush to seq_write. When a write request arrives, it is given the number from seq_write, and that write request may not complete until seq_flush is larger than the saved seq number. We have a new queue for storing stripes which are waiting for a bitmap flush, and an extra flag for stripes to record if the write was 'degraded' and so should not clear the bit in the bitmap. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
parent
0002b2718d
commit
72626685dc
|
@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
|
|||
|
||||
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
|
||||
mddev->bitmap_file == NULL) {
|
||||
if (mddev->level != 1) {
|
||||
if (mddev->level != 1 && mddev->level != 5) {
|
||||
/* FIXME use a better test */
|
||||
printk(KERN_WARNING "md: bitmaps only support for raid1\n");
|
||||
return -EINVAL;
|
||||
|
@ -3517,7 +3517,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
|
|||
*/
|
||||
void md_write_start(mddev_t *mddev, struct bio *bi)
|
||||
{
|
||||
DEFINE_WAIT(w);
|
||||
if (bio_data_dir(bi) != WRITE)
|
||||
return;
|
||||
|
||||
|
|
|
@ -24,6 +24,8 @@
|
|||
#include <linux/bitops.h>
|
||||
#include <asm/atomic.h>
|
||||
|
||||
#include <linux/raid/bitmap.h>
|
||||
|
||||
/*
|
||||
* Stripe cache
|
||||
*/
|
||||
|
@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
|
|||
if (test_bit(STRIPE_HANDLE, &sh->state)) {
|
||||
if (test_bit(STRIPE_DELAYED, &sh->state))
|
||||
list_add_tail(&sh->lru, &conf->delayed_list);
|
||||
else
|
||||
else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
|
||||
conf->seq_write == sh->bm_seq)
|
||||
list_add_tail(&sh->lru, &conf->bitmap_list);
|
||||
else {
|
||||
clear_bit(STRIPE_BIT_DELAY, &sh->state);
|
||||
list_add_tail(&sh->lru, &conf->handle_list);
|
||||
}
|
||||
md_wakeup_thread(conf->mddev->thread);
|
||||
} else {
|
||||
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
|
||||
|
@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
|
|||
spin_lock_irq(&conf->device_lock);
|
||||
|
||||
do {
|
||||
wait_event_lock_irq(conf->wait_for_stripe,
|
||||
conf->quiesce == 0,
|
||||
conf->device_lock, /* nothing */);
|
||||
sh = __find_stripe(conf, sector);
|
||||
if (!sh) {
|
||||
if (!conf->inactive_blocked)
|
||||
|
@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
|
|||
{
|
||||
struct bio **bip;
|
||||
raid5_conf_t *conf = sh->raid_conf;
|
||||
int firstwrite=0;
|
||||
|
||||
PRINTK("adding bh b#%llu to stripe s#%llu\n",
|
||||
(unsigned long long)bi->bi_sector,
|
||||
|
@ -811,9 +822,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
|
|||
|
||||
spin_lock(&sh->lock);
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
if (forwrite)
|
||||
if (forwrite) {
|
||||
bip = &sh->dev[dd_idx].towrite;
|
||||
else
|
||||
if (*bip == NULL && sh->dev[dd_idx].written == NULL)
|
||||
firstwrite = 1;
|
||||
} else
|
||||
bip = &sh->dev[dd_idx].toread;
|
||||
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
|
||||
if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
|
||||
|
@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
|
|||
(unsigned long long)bi->bi_sector,
|
||||
(unsigned long long)sh->sector, dd_idx);
|
||||
|
||||
if (conf->mddev->bitmap && firstwrite) {
|
||||
sh->bm_seq = conf->seq_write;
|
||||
bitmap_startwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS, 0);
|
||||
set_bit(STRIPE_BIT_DELAY, &sh->state);
|
||||
}
|
||||
|
||||
if (forwrite) {
|
||||
/* check if page is covered */
|
||||
sector_t sector = sh->dev[dd_idx].sector;
|
||||
|
@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
* need to be failed
|
||||
*/
|
||||
if (failed > 1 && to_read+to_write+written) {
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
for (i=disks; i--; ) {
|
||||
int bitmap_end = 0;
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
/* fail all writes first */
|
||||
bi = sh->dev[i].towrite;
|
||||
sh->dev[i].towrite = NULL;
|
||||
if (bi) to_write--;
|
||||
if (bi) { to_write--; bitmap_end = 1; }
|
||||
|
||||
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
|
@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
/* and fail all 'written' */
|
||||
bi = sh->dev[i].written;
|
||||
sh->dev[i].written = NULL;
|
||||
if (bi) bitmap_end = 1;
|
||||
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
|
||||
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
|
||||
clear_bit(BIO_UPTODATE, &bi->bi_flags);
|
||||
|
@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
bi = nextbi;
|
||||
}
|
||||
}
|
||||
}
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
if (bitmap_end)
|
||||
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS, 0, 0);
|
||||
}
|
||||
}
|
||||
if (failed > 1 && syncing) {
|
||||
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
|
||||
|
@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
test_bit(R5_UPTODATE, &dev->flags) ) {
|
||||
/* We can return any write requests */
|
||||
struct bio *wbi, *wbi2;
|
||||
int bitmap_end = 0;
|
||||
PRINTK("Return write for disc %d\n", i);
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
wbi = dev->written;
|
||||
|
@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
}
|
||||
wbi = wbi2;
|
||||
}
|
||||
if (dev->towrite == NULL)
|
||||
bitmap_end = 1;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
if (bitmap_end)
|
||||
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS,
|
||||
!test_bit(STRIPE_DEGRADED, &sh->state), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
}
|
||||
}
|
||||
/* now if nothing is locked, and if we have enough data, we can start a write request */
|
||||
if (locked == 0 && (rcw == 0 ||rmw == 0)) {
|
||||
if (locked == 0 && (rcw == 0 ||rmw == 0) &&
|
||||
!test_bit(STRIPE_BIT_DELAY, &sh->state)) {
|
||||
PRINTK("Computing parity...\n");
|
||||
compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
|
||||
/* now every locked buffer is ready to be written */
|
||||
|
@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
dev = &sh->dev[failed_num];
|
||||
set_bit(R5_LOCKED, &dev->flags);
|
||||
set_bit(R5_Wantwrite, &dev->flags);
|
||||
clear_bit(STRIPE_DEGRADED, &sh->state);
|
||||
locked++;
|
||||
set_bit(STRIPE_INSYNC, &sh->state);
|
||||
set_bit(R5_Syncio, &dev->flags);
|
||||
|
@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
bi->bi_next = NULL;
|
||||
generic_make_request(bi);
|
||||
} else {
|
||||
if (rw == 1)
|
||||
set_bit(STRIPE_DEGRADED, &sh->state);
|
||||
PRINTK("skip op %ld on disc %d for sector %llu\n",
|
||||
bi->bi_rw, i, (unsigned long long)sh->sector);
|
||||
clear_bit(R5_LOCKED, &sh->dev[i].flags);
|
||||
|
@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * activate_bit_delay - re-queue every stripe parked on conf->bitmap_list.
 *
 * Empties conf->bitmap_list and passes each stripe that was on it through
 * __release_stripe() again, so the stripe is re-evaluated against the
 * current conf->seq_write.  The visible caller (raid5d) invokes this right
 * after bitmap_unplug() and after copying seq_flush into seq_write.
 *
 * @conf: raid5 private data whose bitmap_list is drained.
 */
static inline void activate_bit_delay(raid5_conf_t *conf)
|
||||
{
|
||||
/* device_lock is held */
|
||||
/* Temporary on-stack list head that takes over the queued stripes. */
struct list_head head;
|
||||
/*
 * Splice trick: link 'head' into the ring next to bitmap_list, then
 * unlink and re-initialise bitmap_list itself.  The stripes that were
 * queued are now reachable only through 'head', and bitmap_list is empty.
 */
list_add(&head, &conf->bitmap_list);
|
||||
list_del_init(&conf->bitmap_list);
|
||||
while (!list_empty(&head)) {
|
||||
struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
|
||||
list_del_init(&sh->lru);
|
||||
/*
 * Take a reference before handing the stripe to __release_stripe().
 * NOTE(review): presumably __release_stripe() drops this reference
 * before re-queueing; its opening lines are not visible here - confirm.
 */
atomic_inc(&sh->count);
|
||||
__release_stripe(conf, sh);
|
||||
}
|
||||
}
|
||||
|
||||
static void unplug_slaves(mddev_t *mddev)
|
||||
{
|
||||
raid5_conf_t *conf = mddev_to_conf(mddev);
|
||||
|
@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q)
|
|||
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
|
||||
if (blk_remove_plug(q))
|
||||
if (blk_remove_plug(q)) {
|
||||
conf->seq_flush++;
|
||||
raid5_activate_delayed(conf);
|
||||
}
|
||||
md_wakeup_thread(mddev->thread);
|
||||
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
|
@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
|
|||
sector_t first_sector;
|
||||
int raid_disks = conf->raid_disks;
|
||||
int data_disks = raid_disks-1;
|
||||
sector_t max_sector = mddev->size << 1;
|
||||
int sync_blocks;
|
||||
|
||||
if (sector_nr >= mddev->size <<1) {
|
||||
if (sector_nr >= max_sector) {
|
||||
/* just being told to finish up .. nothing much to do */
|
||||
unplug_slaves(mddev);
|
||||
|
||||
if (mddev->curr_resync < max_sector) /* aborted */
|
||||
bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
|
||||
&sync_blocks, 1);
|
||||
else /* compelted sync */
|
||||
conf->fullsync = 0;
|
||||
bitmap_close_sync(mddev->bitmap);
|
||||
|
||||
return 0;
|
||||
}
|
||||
/* if there is 1 or more failed drives and we are trying
|
||||
|
@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
|
|||
*skipped = 1;
|
||||
return rv;
|
||||
}
|
||||
if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
|
||||
!conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
|
||||
/* we can skip this block, and probably more */
|
||||
sync_blocks /= STRIPE_SECTORS;
|
||||
*skipped = 1;
|
||||
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
|
||||
}
|
||||
|
||||
x = sector_nr;
|
||||
chunk_offset = sector_div(x, sectors_per_chunk);
|
||||
|
@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
|
|||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
schedule_timeout(1);
|
||||
}
|
||||
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0);
|
||||
spin_lock(&sh->lock);
|
||||
set_bit(STRIPE_SYNCING, &sh->state);
|
||||
clear_bit(STRIPE_INSYNC, &sh->state);
|
||||
|
@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev)
|
|||
while (1) {
|
||||
struct list_head *first;
|
||||
|
||||
if (conf->seq_flush - conf->seq_write > 0) {
|
||||
int seq = conf->seq_flush;
|
||||
bitmap_unplug(mddev->bitmap);
|
||||
conf->seq_write = seq;
|
||||
activate_bit_delay(conf);
|
||||
}
|
||||
|
||||
if (list_empty(&conf->handle_list) &&
|
||||
atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
|
||||
!blk_queue_plugged(mddev->queue) &&
|
||||
|
@ -1621,6 +1698,7 @@ static int run (mddev_t *mddev)
|
|||
init_waitqueue_head(&conf->wait_for_overlap);
|
||||
INIT_LIST_HEAD(&conf->handle_list);
|
||||
INIT_LIST_HEAD(&conf->delayed_list);
|
||||
INIT_LIST_HEAD(&conf->bitmap_list);
|
||||
INIT_LIST_HEAD(&conf->inactive_list);
|
||||
atomic_set(&conf->active_stripes, 0);
|
||||
atomic_set(&conf->preread_active_stripes, 0);
|
||||
|
@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
|
|||
|
||||
/* Ok, everything is just fine now */
|
||||
|
||||
if (mddev->bitmap)
|
||||
mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
|
||||
|
||||
mddev->queue->unplug_fn = raid5_unplug_device;
|
||||
mddev->queue->issue_flush_fn = raid5_issue_flush;
|
||||
|
||||
|
@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
|
|||
rdev->in_sync = 0;
|
||||
rdev->raid_disk = disk;
|
||||
found = 1;
|
||||
if (rdev->saved_raid_disk != disk)
|
||||
conf->fullsync = 1;
|
||||
p->rdev = rdev;
|
||||
break;
|
||||
}
|
||||
|
@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * raid5_quiesce - personality ->quiesce hook for raid5.
 *
 * @mddev: the array being quiesced or resumed.
 * @state: 1 sets conf->quiesce and waits until no stripes are active;
 *         0 clears conf->quiesce and wakes waiters.  Any other value
 *         skips the switch and only performs the thread update below.
 *
 * In every case, if the array has a thread, its timeout is refreshed from
 * the bitmap's daemon_sleep (or set to MAX_SCHEDULE_TIMEOUT when there is
 * no bitmap) and the thread is woken.
 */
static void raid5_quiesce(mddev_t *mddev, int state)
|
||||
{
|
||||
raid5_conf_t *conf = mddev_to_conf(mddev);
|
||||
|
||||
switch(state) {
|
||||
case 1: /* stop all writes */
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
/*
 * get_active_stripe() waits on wait_for_stripe for quiesce == 0, so
 * setting this flag holds back new stripe lookups.
 */
conf->quiesce = 1;
|
||||
/*
 * Sleep until the active stripe count reads zero.
 * NOTE(review): wait_event_lock_irq presumably releases device_lock
 * while sleeping and re-takes it before returning - confirm.
 */
wait_event_lock_irq(conf->wait_for_stripe,
|
||||
atomic_read(&conf->active_stripes) == 0,
|
||||
conf->device_lock, /* nothing */);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
break;
|
||||
|
||||
case 0: /* re-enable writes */
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
conf->quiesce = 0;
|
||||
/* Let anything blocked on wait_for_stripe re-check its condition. */
wake_up(&conf->wait_for_stripe);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
break;
|
||||
}
|
||||
/*
 * Runs for every value of 'state': pick the thread's wake-up interval
 * according to whether a bitmap is currently attached, then wake the
 * thread.
 */
if (mddev->thread) {
|
||||
if (mddev->bitmap)
|
||||
mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
|
||||
else
|
||||
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
}
|
||||
static mdk_personality_t raid5_personality=
|
||||
{
|
||||
.name = "raid5",
|
||||
|
@ -1955,6 +2067,7 @@ static mdk_personality_t raid5_personality=
|
|||
.spare_active = raid5_spare_active,
|
||||
.sync_request = sync_request,
|
||||
.resize = raid5_resize,
|
||||
.quiesce = raid5_quiesce,
|
||||
};
|
||||
|
||||
static int __init raid5_init (void)
|
||||
|
|
|
@ -134,6 +134,7 @@ struct stripe_head {
|
|||
unsigned long state; /* state flags */
|
||||
atomic_t count; /* nr of active thread/requests */
|
||||
spinlock_t lock;
|
||||
int bm_seq; /* sequence number for bitmap flushes */
|
||||
struct r5dev {
|
||||
struct bio req;
|
||||
struct bio_vec vec;
|
||||
|
@ -165,12 +166,13 @@ struct stripe_head {
|
|||
/*
|
||||
* Stripe state
|
||||
*/
|
||||
#define STRIPE_ERROR 1
|
||||
#define STRIPE_HANDLE 2
|
||||
#define STRIPE_SYNCING 3
|
||||
#define STRIPE_INSYNC 4
|
||||
#define STRIPE_PREREAD_ACTIVE 5
|
||||
#define STRIPE_DELAYED 6
|
||||
#define STRIPE_DEGRADED 7
|
||||
#define STRIPE_BIT_DELAY 8
|
||||
|
||||
/*
|
||||
* Plugging:
|
||||
|
@ -210,10 +212,20 @@ struct raid5_private_data {
|
|||
|
||||
struct list_head handle_list; /* stripes needing handling */
|
||||
struct list_head delayed_list; /* stripes that have plugged requests */
|
||||
struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */
|
||||
atomic_t preread_active_stripes; /* stripes with scheduled io */
|
||||
|
||||
char cache_name[20];
|
||||
kmem_cache_t *slab_cache; /* for allocating stripes */
|
||||
|
||||
int seq_flush, seq_write;
|
||||
int quiesce;
|
||||
|
||||
int fullsync; /* set to 1 if a full sync is needed,
|
||||
* (fresh device added).
|
||||
* Cleared when a sync completes.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Free stripes pool
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue