dm raid: fix transient device failure processing

This fix addresses the following 3 failure scenarios:

1) If a (transiently) inaccessible metadata device is being passed into the
constructor (e.g. a device tuple '254:4 254:5'), it is processed as if
'- -' was given.  This erroneously results in a status table line containing
'- -', which mistakenly differs from what has been passed in.  As a result,
userspace libdevmapper puts the device tuple separate from the RAID device,
thus not processing the dependencies properly.

2) A false health status char 'A' instead of 'D' is emitted on the
status info line for the meta/data device tuple in this metadata device
failure case.

3) If the metadata device is accessible when passed into the constructor
but the data device (partially) isn't, that leg may be set faulty by the
raid personality on access to the (partially) unavailable leg.  A restore
attempted in a second raid device resume on such a failed leg (status char 'D')
fails after the (partial) leg has returned.

Fixes for aforementioned failure scenarios:

- don't release passed in devices in the constructor thus allowing the
  status table line to e.g. contain '254:4 254:5' rather than '- -'

- emit device status char 'D' rather than 'A' for the device tuple
  with the failed metadata device on the status info line

- when attempting to restore faulty devices in a second resume, allow the
  device hot remove function to succeed by setting the device to not in-sync

In case userspace intentionally passes '- -' into the constructor to avoid that
device tuple (e.g. to split off a raid1 leg temporarily for later re-addition),
the status table line will correctly show '- -' and the status info line will
provide a '-' device health character for the non-defined device tuple.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
This commit is contained in:
Heinz Mauelshagen 2017-01-14 03:53:07 +01:00 committed by Mike Snitzer
parent 7a308bb301
commit c63ede3b42
2 changed files with 42 additions and 49 deletions

View File

@ -314,3 +314,7 @@ Version History
1.9.0 Add support for RAID level takeover/reshape/region size 1.9.0 Add support for RAID level takeover/reshape/region size
and set size reduction. and set size reduction.
1.9.1 Fix activation of existing RAID 4/10 mapped devices 1.9.1 Fix activation of existing RAID 4/10 mapped devices
1.9.2 Don't emit '- -' on the status table line in case the constructor
fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and
'D' on the status line. If '- -' is passed into the constructor, emit
'- -' on the table line and '-' as the status line health character.

View File

@ -2253,7 +2253,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
struct dm_raid_superblock *sb; struct dm_raid_superblock *sb;
if (rs_is_raid0(rs) || !rdev->sb_page) if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
return 0; return 0;
sb = page_address(rdev->sb_page); sb = page_address(rdev->sb_page);
@ -2316,21 +2316,19 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{ {
int r; int r;
struct raid_dev *dev; struct md_rdev *rdev, *freshest;
struct md_rdev *rdev, *tmp, *freshest;
struct mddev *mddev = &rs->md; struct mddev *mddev = &rs->md;
freshest = NULL; freshest = NULL;
rdev_for_each_safe(rdev, tmp, mddev) { rdev_for_each(rdev, mddev) {
/* /*
* Skipping super_load due to CTR_FLAG_SYNC will cause * Skipping super_load due to CTR_FLAG_SYNC will cause
* the array to undergo initialization again as * the array to undergo initialization again as
* though it were new. This is the intended effect * though it were new. This is the intended effect
* of the "sync" directive. * of the "sync" directive.
* *
* When reshaping capability is added, we must ensure * With reshaping capability added, we must ensure that
* that the "sync" directive is disallowed during the * that the "sync" directive is disallowed during the reshape.
* reshape.
*/ */
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
continue; continue;
@ -2347,6 +2345,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 0: case 0:
break; break;
default: default:
/* This is a failure to read the superblock from the metadata device. */
/* /*
* We have to keep any raid0 data/metadata device pairs or * We have to keep any raid0 data/metadata device pairs or
* the MD raid0 personality will fail to start the array. * the MD raid0 personality will fail to start the array.
@ -2354,33 +2353,17 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (rs_is_raid0(rs)) if (rs_is_raid0(rs))
continue; continue;
dev = container_of(rdev, struct raid_dev, rdev);
if (dev->meta_dev)
dm_put_device(ti, dev->meta_dev);
dev->meta_dev = NULL;
rdev->meta_bdev = NULL;
if (rdev->sb_page)
put_page(rdev->sb_page);
rdev->sb_page = NULL;
rdev->sb_loaded = 0;
/* /*
* We might be able to salvage the data device * We keep the dm_devs to be able to emit the device tuple
* even though the meta device has failed. For * properly on the table line in raid_status() (rather than
* now, we behave as though '- -' had been * mistakenly acting as if '- -' got passed into the constructor).
* set for this device in the table. *
* The rdev has to stay on the same_set list to allow for
* the attempt to restore faulty devices on second resume.
*/ */
if (dev->data_dev) set_bit(Faulty, &rdev->flags);
dm_put_device(ti, dev->data_dev); rdev->raid_disk = rdev->saved_raid_disk = -1;
break;
dev->data_dev = NULL;
rdev->bdev = NULL;
list_del(&rdev->same_set);
} }
} }
@ -3078,10 +3061,13 @@ static const char *decipher_sync_action(struct mddev *mddev)
* 'D' = Dead/Failed device * 'D' = Dead/Failed device
* 'a' = Alive but not in-sync * 'a' = Alive but not in-sync
* 'A' = Alive and in-sync * 'A' = Alive and in-sync
* '-' = Non-existing device (i.e. uspace passed '- -' into the ctr)
*/ */
static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
{ {
if (test_bit(Faulty, &rdev->flags)) if (!rdev->bdev)
return "-";
else if (test_bit(Faulty, &rdev->flags))
return "D"; return "D";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
return "a"; return "a";
@ -3183,7 +3169,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
sector_t progress, resync_max_sectors, resync_mismatches; sector_t progress, resync_max_sectors, resync_mismatches;
const char *sync_action; const char *sync_action;
struct raid_type *rt; struct raid_type *rt;
struct md_rdev *rdev;
switch (type) { switch (type) {
case STATUSTYPE_INFO: case STATUSTYPE_INFO:
@ -3204,9 +3189,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
atomic64_read(&mddev->resync_mismatches) : 0; atomic64_read(&mddev->resync_mismatches) : 0;
sync_action = decipher_sync_action(&rs->md); sync_action = decipher_sync_action(&rs->md);
/* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */ /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
rdev_for_each(rdev, mddev) for (i = 0; i < rs->raid_disks; i++)
DMEMIT(__raid_dev_status(rdev, array_in_sync)); DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
/* /*
* In-sync/Reshape ratio: * In-sync/Reshape ratio:
@ -3427,7 +3412,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
for (i = 0; i < rs->md.raid_disks; i++) { for (i = 0; i < mddev->raid_disks; i++) {
r = &rs->dev[i].rdev; r = &rs->dev[i].rdev;
if (test_bit(Faulty, &r->flags) && r->sb_page && if (test_bit(Faulty, &r->flags) && r->sb_page &&
sync_page_io(r, 0, r->sb_size, r->sb_page, sync_page_io(r, 0, r->sb_size, r->sb_page,
@ -3445,22 +3430,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
* '>= 0' - meaning we must call this function * '>= 0' - meaning we must call this function
* ourselves. * ourselves.
*/ */
if ((r->raid_disk >= 0) &&
(mddev->pers->hot_remove_disk(mddev, r) != 0))
/* Failed to revive this device, try next */
continue;
r->raid_disk = i;
r->saved_raid_disk = i;
flags = r->flags; flags = r->flags;
clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
if (r->raid_disk >= 0) {
if (mddev->pers->hot_remove_disk(mddev, r)) {
/* Failed to revive this device, try next */
r->flags = flags;
continue;
}
} else
r->raid_disk = r->saved_raid_disk = i;
clear_bit(Faulty, &r->flags); clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags); clear_bit(WriteErrorSeen, &r->flags);
clear_bit(In_sync, &r->flags);
if (mddev->pers->hot_add_disk(mddev, r)) { if (mddev->pers->hot_add_disk(mddev, r)) {
r->raid_disk = -1; /* Failed to revive this device, try next */
r->saved_raid_disk = -1; r->raid_disk = r->saved_raid_disk = -1;
r->flags = flags; r->flags = flags;
} else { } else {
clear_bit(In_sync, &r->flags);
r->recovery_offset = 0; r->recovery_offset = 0;
set_bit(i, (void *) cleared_failed_devices); set_bit(i, (void *) cleared_failed_devices);
cleared = true; cleared = true;
@ -3651,7 +3640,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = { static struct target_type raid_target = {
.name = "raid", .name = "raid",
.version = {1, 9, 1}, .version = {1, 9, 2},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = raid_ctr, .ctr = raid_ctr,
.dtr = raid_dtr, .dtr = raid_dtr,