md/raid10: handle further errors during fix_read_error better.

If we find more read/write errors we should record a bad block before
failing the device.

Signed-off-by: NeilBrown <neilb@suse.de>
This commit is contained in:
NeilBrown 2011-07-28 11:39:25 +10:00
parent 5e5702898e
commit 58c54fcca3
1 changed files with 44 additions and 15 deletions

View File

@ -1749,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
}
static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector,
int sectors, struct page *page, int rw)
{
sector_t first_bad;
int bad_sectors;
if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
&& (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
return -1;
if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
/* success */
return 1;
if (rw == WRITE)
set_bit(WriteErrorSeen, &rdev->flags);
/* need to record an error - either for the block or the device */
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
md_error(rdev->mddev, rdev);
return 0;
}
/*
* This is a kernel thread which:
*
@ -1832,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_unlock();
if (!success) {
/* Cannot read from anywhere -- bye bye array */
/* Cannot read from anywhere, just mark the block
* as bad on the first device to discourage future
* reads.
*/
int dn = r10_bio->devs[r10_bio->read_slot].devnum;
md_error(mddev, conf->mirrors[dn].rdev);
rdev = conf->mirrors[dn].rdev;
if (!rdev_set_badblocks(
rdev,
r10_bio->devs[r10_bio->read_slot].addr
+ sect,
s, 0))
md_error(mddev, rdev);
break;
}
@ -1855,10 +1885,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
s<<9, conf->tmppage, WRITE, false)
if (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
s<<9, conf->tmppage, WRITE)
== 0) {
/* Well, this device is dead */
printk(KERN_NOTICE
@ -1873,7 +1903,6 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"drive\n",
mdname(mddev),
bdevname(rdev->bdev, b));
md_error(mddev, rdev);
}
rdev_dec_pending(rdev, mddev);
rcu_read_lock();
@ -1893,11 +1922,12 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
s<<9, conf->tmppage,
READ, false) == 0) {
switch (r10_sync_page_io(rdev,
r10_bio->devs[sl].addr +
sect,
s<<9, conf->tmppage,
READ)) {
case 0:
/* Well, this device is dead */
printk(KERN_NOTICE
"md/raid10:%s: unable to read back "
@ -1911,9 +1941,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"drive\n",
mdname(mddev),
bdevname(rdev->bdev, b));
md_error(mddev, rdev);
} else {
break;
case 1:
printk(KERN_INFO
"md/raid10:%s: read error corrected"
" (%d sectors at %llu on %s)\n",