Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull MD updates from Shaohua Li: "A few fixes of MD for this merge window. Mostly bug fixes: - raid5 stripe batch fix from Amy - Read error handling for raid1 FailFast device from Gioh - raid10 recovery NULL pointer dereference fix from Guoqing - Support write hint for raid5 stripe cache from Mariusz - Fixes for device hot add/remove from Neil and Yufen - Improve flush bio scalability from Xiao" * 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: MD: fix lock contention for flush bios md/raid5: Assigning NULL to sh->batch_head before testing bit R5_Overlap of a stripe md/raid1: add error handling of read error from FailFast device md: fix NULL dereference of mddev->pers in remove_and_add_spares() raid5: copy write hint from origin bio to stripe md: fix two problems with setting the "re-add" device state. raid10: check bio in r10buf_pool_free to avoid NULL pointer dereference md: fix an error code format and remove unused bio_sector
This commit is contained in:
commit
d60dafdca4
171
drivers/md/md.c
171
drivers/md/md.c
|
@ -132,6 +132,24 @@ static inline int speed_max(struct mddev *mddev)
|
|||
mddev->sync_speed_max : sysctl_speed_limit_max;
|
||||
}
|
||||
|
||||
static void * flush_info_alloc(gfp_t gfp_flags, void *data)
|
||||
{
|
||||
return kzalloc(sizeof(struct flush_info), gfp_flags);
|
||||
}
|
||||
static void flush_info_free(void *flush_info, void *data)
|
||||
{
|
||||
kfree(flush_info);
|
||||
}
|
||||
|
||||
static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
|
||||
{
|
||||
return kzalloc(sizeof(struct flush_bio), gfp_flags);
|
||||
}
|
||||
static void flush_bio_free(void *flush_bio, void *data)
|
||||
{
|
||||
kfree(flush_bio);
|
||||
}
|
||||
|
||||
static struct ctl_table_header *raid_table_header;
|
||||
|
||||
static struct ctl_table raid_table[] = {
|
||||
|
@ -414,30 +432,53 @@ static int md_congested(void *data, int bits)
|
|||
/*
|
||||
* Generic flush handling for md
|
||||
*/
|
||||
|
||||
static void md_end_flush(struct bio *bio)
|
||||
static void submit_flushes(struct work_struct *ws)
|
||||
{
|
||||
struct md_rdev *rdev = bio->bi_private;
|
||||
struct mddev *mddev = rdev->mddev;
|
||||
struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
|
||||
struct mddev *mddev = fi->mddev;
|
||||
struct bio *bio = fi->bio;
|
||||
|
||||
bio->bi_opf &= ~REQ_PREFLUSH;
|
||||
md_handle_request(mddev, bio);
|
||||
|
||||
mempool_free(fi, mddev->flush_pool);
|
||||
}
|
||||
|
||||
static void md_end_flush(struct bio *fbio)
|
||||
{
|
||||
struct flush_bio *fb = fbio->bi_private;
|
||||
struct md_rdev *rdev = fb->rdev;
|
||||
struct flush_info *fi = fb->fi;
|
||||
struct bio *bio = fi->bio;
|
||||
struct mddev *mddev = fi->mddev;
|
||||
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
|
||||
if (atomic_dec_and_test(&mddev->flush_pending)) {
|
||||
/* The pre-request flush has finished */
|
||||
queue_work(md_wq, &mddev->flush_work);
|
||||
if (atomic_dec_and_test(&fi->flush_pending)) {
|
||||
if (bio->bi_iter.bi_size == 0)
|
||||
/* an empty barrier - all done */
|
||||
bio_endio(bio);
|
||||
else {
|
||||
INIT_WORK(&fi->flush_work, submit_flushes);
|
||||
queue_work(md_wq, &fi->flush_work);
|
||||
}
|
||||
}
|
||||
bio_put(bio);
|
||||
|
||||
mempool_free(fb, mddev->flush_bio_pool);
|
||||
bio_put(fbio);
|
||||
}
|
||||
|
||||
static void md_submit_flush_data(struct work_struct *ws);
|
||||
|
||||
static void submit_flushes(struct work_struct *ws)
|
||||
void md_flush_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
|
||||
struct md_rdev *rdev;
|
||||
struct flush_info *fi;
|
||||
|
||||
fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
|
||||
|
||||
fi->bio = bio;
|
||||
fi->mddev = mddev;
|
||||
atomic_set(&fi->flush_pending, 1);
|
||||
|
||||
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
|
||||
atomic_set(&mddev->flush_pending, 1);
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
|
@ -447,59 +488,39 @@ static void submit_flushes(struct work_struct *ws)
|
|||
* we reclaim rcu_read_lock
|
||||
*/
|
||||
struct bio *bi;
|
||||
struct flush_bio *fb;
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
|
||||
fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
|
||||
fb->fi = fi;
|
||||
fb->rdev = rdev;
|
||||
|
||||
bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
|
||||
bi->bi_end_io = md_end_flush;
|
||||
bi->bi_private = rdev;
|
||||
bio_set_dev(bi, rdev->bdev);
|
||||
bi->bi_end_io = md_end_flush;
|
||||
bi->bi_private = fb;
|
||||
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
|
||||
atomic_inc(&mddev->flush_pending);
|
||||
|
||||
atomic_inc(&fi->flush_pending);
|
||||
submit_bio(bi);
|
||||
|
||||
rcu_read_lock();
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (atomic_dec_and_test(&mddev->flush_pending))
|
||||
queue_work(md_wq, &mddev->flush_work);
|
||||
}
|
||||
|
||||
static void md_submit_flush_data(struct work_struct *ws)
|
||||
{
|
||||
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
|
||||
struct bio *bio = mddev->flush_bio;
|
||||
|
||||
/*
|
||||
* must reset flush_bio before calling into md_handle_request to avoid a
|
||||
* deadlock, because other bios passed md_handle_request suspend check
|
||||
* could wait for this and below md_handle_request could wait for those
|
||||
* bios because of suspend check
|
||||
*/
|
||||
mddev->flush_bio = NULL;
|
||||
wake_up(&mddev->sb_wait);
|
||||
|
||||
if (bio->bi_iter.bi_size == 0)
|
||||
/* an empty barrier - all done */
|
||||
bio_endio(bio);
|
||||
else {
|
||||
bio->bi_opf &= ~REQ_PREFLUSH;
|
||||
md_handle_request(mddev, bio);
|
||||
if (atomic_dec_and_test(&fi->flush_pending)) {
|
||||
if (bio->bi_iter.bi_size == 0)
|
||||
/* an empty barrier - all done */
|
||||
bio_endio(bio);
|
||||
else {
|
||||
INIT_WORK(&fi->flush_work, submit_flushes);
|
||||
queue_work(md_wq, &fi->flush_work);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void md_flush_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
spin_lock_irq(&mddev->lock);
|
||||
wait_event_lock_irq(mddev->sb_wait,
|
||||
!mddev->flush_bio,
|
||||
mddev->lock);
|
||||
mddev->flush_bio = bio;
|
||||
spin_unlock_irq(&mddev->lock);
|
||||
|
||||
INIT_WORK(&mddev->flush_work, submit_flushes);
|
||||
queue_work(md_wq, &mddev->flush_work);
|
||||
}
|
||||
EXPORT_SYMBOL(md_flush_request);
|
||||
|
||||
static inline struct mddev *mddev_get(struct mddev *mddev)
|
||||
|
@ -546,7 +567,6 @@ void mddev_init(struct mddev *mddev)
|
|||
atomic_set(&mddev->openers, 0);
|
||||
atomic_set(&mddev->active_io, 0);
|
||||
spin_lock_init(&mddev->lock);
|
||||
atomic_set(&mddev->flush_pending, 0);
|
||||
init_waitqueue_head(&mddev->sb_wait);
|
||||
init_waitqueue_head(&mddev->recovery_wait);
|
||||
mddev->reshape_position = MaxSector;
|
||||
|
@ -2844,7 +2864,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
|
|||
err = 0;
|
||||
}
|
||||
} else if (cmd_match(buf, "re-add")) {
|
||||
if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
|
||||
if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
|
||||
rdev->saved_raid_disk >= 0) {
|
||||
/* clear_bit is performed _after_ all the devices
|
||||
* have their local Faulty bit cleared. If any writes
|
||||
* happen in the meantime in the local node, they
|
||||
|
@ -5499,6 +5520,22 @@ int md_run(struct mddev *mddev)
|
|||
if (err)
|
||||
return err;
|
||||
}
|
||||
if (mddev->flush_pool == NULL) {
|
||||
mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
|
||||
flush_info_free, mddev);
|
||||
if (!mddev->flush_pool) {
|
||||
err = -ENOMEM;
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
if (mddev->flush_bio_pool == NULL) {
|
||||
mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
|
||||
flush_bio_free, mddev);
|
||||
if (!mddev->flush_bio_pool) {
|
||||
err = -ENOMEM;
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
|
||||
spin_lock(&pers_lock);
|
||||
pers = find_pers(mddev->level, mddev->clevel);
|
||||
|
@ -5654,6 +5691,18 @@ int md_run(struct mddev *mddev)
|
|||
sysfs_notify_dirent_safe(mddev->sysfs_action);
|
||||
sysfs_notify(&mddev->kobj, NULL, "degraded");
|
||||
return 0;
|
||||
|
||||
abort:
|
||||
if (mddev->flush_bio_pool) {
|
||||
mempool_destroy(mddev->flush_bio_pool);
|
||||
mddev->flush_bio_pool = NULL;
|
||||
}
|
||||
if (mddev->flush_pool){
|
||||
mempool_destroy(mddev->flush_pool);
|
||||
mddev->flush_pool = NULL;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(md_run);
|
||||
|
||||
|
@ -5864,6 +5913,14 @@ void md_stop(struct mddev *mddev)
|
|||
* This is called from dm-raid
|
||||
*/
|
||||
__md_stop(mddev);
|
||||
if (mddev->flush_bio_pool) {
|
||||
mempool_destroy(mddev->flush_bio_pool);
|
||||
mddev->flush_bio_pool = NULL;
|
||||
}
|
||||
if (mddev->flush_pool) {
|
||||
mempool_destroy(mddev->flush_pool);
|
||||
mddev->flush_pool = NULL;
|
||||
}
|
||||
bioset_exit(&mddev->bio_set);
|
||||
bioset_exit(&mddev->sync_set);
|
||||
}
|
||||
|
@ -6494,6 +6551,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
|
|||
char b[BDEVNAME_SIZE];
|
||||
struct md_rdev *rdev;
|
||||
|
||||
if (!mddev->pers)
|
||||
return -ENODEV;
|
||||
|
||||
rdev = find_rdev(mddev, dev);
|
||||
if (!rdev)
|
||||
return -ENXIO;
|
||||
|
@ -8611,6 +8671,7 @@ static int remove_and_add_spares(struct mddev *mddev,
|
|||
if (mddev->pers->hot_remove_disk(
|
||||
mddev, rdev) == 0) {
|
||||
sysfs_unlink_rdev(mddev, rdev);
|
||||
rdev->saved_raid_disk = rdev->raid_disk;
|
||||
rdev->raid_disk = -1;
|
||||
removed++;
|
||||
}
|
||||
|
|
|
@ -252,6 +252,19 @@ enum mddev_sb_flags {
|
|||
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
|
||||
};
|
||||
|
||||
#define NR_FLUSH_INFOS 8
|
||||
#define NR_FLUSH_BIOS 64
|
||||
struct flush_info {
|
||||
struct bio *bio;
|
||||
struct mddev *mddev;
|
||||
struct work_struct flush_work;
|
||||
atomic_t flush_pending;
|
||||
};
|
||||
struct flush_bio {
|
||||
struct flush_info *fi;
|
||||
struct md_rdev *rdev;
|
||||
};
|
||||
|
||||
struct mddev {
|
||||
void *private;
|
||||
struct md_personality *pers;
|
||||
|
@ -457,13 +470,8 @@ struct mddev {
|
|||
* metadata and bitmap writes
|
||||
*/
|
||||
|
||||
/* Generic flush handling.
|
||||
* The last to finish preflush schedules a worker to submit
|
||||
* the rest of the request (without the REQ_PREFLUSH flag).
|
||||
*/
|
||||
struct bio *flush_bio;
|
||||
atomic_t flush_pending;
|
||||
struct work_struct flush_work;
|
||||
mempool_t *flush_pool;
|
||||
mempool_t *flush_bio_pool;
|
||||
struct work_struct event_work; /* used by dm to report failure event */
|
||||
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
|
||||
struct md_cluster_info *cluster_info;
|
||||
|
|
|
@ -2449,7 +2449,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
|
|||
struct mddev *mddev = conf->mddev;
|
||||
struct bio *bio;
|
||||
struct md_rdev *rdev;
|
||||
sector_t bio_sector;
|
||||
|
||||
clear_bit(R1BIO_ReadError, &r1_bio->state);
|
||||
/* we got a read error. Maybe the drive is bad. Maybe just
|
||||
|
@ -2462,7 +2461,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
|
|||
*/
|
||||
|
||||
bio = r1_bio->bios[r1_bio->read_disk];
|
||||
bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
|
||||
bio_put(bio);
|
||||
r1_bio->bios[r1_bio->read_disk] = NULL;
|
||||
|
||||
|
@ -2473,6 +2471,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
|
|||
fix_read_error(conf, r1_bio->read_disk,
|
||||
r1_bio->sector, r1_bio->sectors);
|
||||
unfreeze_array(conf);
|
||||
} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
|
||||
md_error(mddev, rdev);
|
||||
} else {
|
||||
r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
|
||||
}
|
||||
|
|
|
@ -255,9 +255,11 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
|
|||
for (j = conf->copies; j--; ) {
|
||||
struct bio *bio = r10bio->devs[j].bio;
|
||||
|
||||
rp = get_resync_pages(bio);
|
||||
resync_free_pages(rp);
|
||||
bio_put(bio);
|
||||
if (bio) {
|
||||
rp = get_resync_pages(bio);
|
||||
resync_free_pages(rp);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
bio = r10bio->devs[j].repl_bio;
|
||||
if (bio)
|
||||
|
@ -2362,7 +2364,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
|||
{
|
||||
int sect = 0; /* Offset from r10_bio->sector */
|
||||
int sectors = r10_bio->sectors;
|
||||
struct md_rdev*rdev;
|
||||
struct md_rdev *rdev;
|
||||
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
|
||||
int d = r10_bio->devs[r10_bio->read_slot].devnum;
|
||||
|
||||
|
|
|
@ -1139,6 +1139,9 @@ again:
|
|||
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
|
||||
bi->bi_io_vec[0].bv_offset = 0;
|
||||
bi->bi_iter.bi_size = STRIPE_SIZE;
|
||||
bi->bi_write_hint = sh->dev[i].write_hint;
|
||||
if (!rrdev)
|
||||
sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
|
||||
/*
|
||||
* If this is discard request, set bi_vcnt 0. We don't
|
||||
* want to confuse SCSI because SCSI will replace payload
|
||||
|
@ -1190,6 +1193,8 @@ again:
|
|||
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
|
||||
rbi->bi_io_vec[0].bv_offset = 0;
|
||||
rbi->bi_iter.bi_size = STRIPE_SIZE;
|
||||
rbi->bi_write_hint = sh->dev[i].write_hint;
|
||||
sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET;
|
||||
/*
|
||||
* If this is discard request, set bi_vcnt 0. We don't
|
||||
* want to confuse SCSI because SCSI will replace payload
|
||||
|
@ -3204,6 +3209,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
|
|||
(unsigned long long)sh->sector);
|
||||
|
||||
spin_lock_irq(&sh->stripe_lock);
|
||||
sh->dev[dd_idx].write_hint = bi->bi_write_hint;
|
||||
/* Don't allow new IO added to stripes in batch list */
|
||||
if (sh->batch_head)
|
||||
goto overlap;
|
||||
|
@ -4614,15 +4620,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
|
|||
|
||||
sh->check_state = head_sh->check_state;
|
||||
sh->reconstruct_state = head_sh->reconstruct_state;
|
||||
spin_lock_irq(&sh->stripe_lock);
|
||||
sh->batch_head = NULL;
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
for (i = 0; i < sh->disks; i++) {
|
||||
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
|
||||
do_wakeup = 1;
|
||||
sh->dev[i].flags = head_sh->dev[i].flags &
|
||||
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
|
||||
}
|
||||
spin_lock_irq(&sh->stripe_lock);
|
||||
sh->batch_head = NULL;
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
if (handle_flags == 0 ||
|
||||
sh->state & handle_flags)
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
|
|
|
@ -257,6 +257,7 @@ struct stripe_head {
|
|||
sector_t sector; /* sector of this page */
|
||||
unsigned long flags;
|
||||
u32 log_checksum;
|
||||
unsigned short write_hint;
|
||||
} dev[1]; /* allocated with extra space depending of RAID geometry */
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue