raid5-cache: add journal hot add/remove support
Add support for journal disk hot add/remove. The md part is mostly trivial checks. The raid5 part is a little tricky. For hot-remove, we can't wait for pending writes, as it's called from raid5d; waiting would cause a deadlock. We simply fail the hot-remove. A hot-remove retry can succeed eventually, since if the journal disk is faulty all pending writes will be failed and finish. For hot-add, since an array supporting a journal but without a journal disk will be marked read-only, we are safe to hot add the journal without stopping IO (it should be read IO, while the journal only handles write IO). Signed-off-by: Shaohua Li <shli@fb.com> Signed-off-by: NeilBrown <neilb@suse.com>
This commit is contained in:
parent
9ebc6ef188
commit
f6b6ec5cfa
|
@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
|
||||||
return -EEXIST;
|
return -EEXIST;
|
||||||
|
|
||||||
/* make sure rdev->sectors exceeds mddev->dev_sectors */
|
/* make sure rdev->sectors exceeds mddev->dev_sectors */
|
||||||
if (rdev->sectors && (mddev->dev_sectors == 0 ||
|
if (!test_bit(Journal, &rdev->flags) &&
|
||||||
rdev->sectors < mddev->dev_sectors)) {
|
rdev->sectors &&
|
||||||
|
(mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
|
||||||
if (mddev->pers) {
|
if (mddev->pers) {
|
||||||
/* Cannot change size, so fail
|
/* Cannot change size, so fail
|
||||||
* If mddev->level <= 0, then we don't care
|
* If mddev->level <= 0, then we don't care
|
||||||
|
@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
|
if (!test_bit(Journal, &rdev->flags) &&
|
||||||
|
mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
|
||||||
printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
|
printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
|
||||||
mdname(mddev), mddev->max_disks);
|
mdname(mddev), mddev->max_disks);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
|
@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
|
||||||
else
|
else
|
||||||
clear_bit(WriteMostly, &rdev->flags);
|
clear_bit(WriteMostly, &rdev->flags);
|
||||||
|
|
||||||
if (info->state & (1<<MD_DISK_JOURNAL))
|
if (info->state & (1<<MD_DISK_JOURNAL)) {
|
||||||
|
struct md_rdev *rdev2;
|
||||||
|
bool has_journal = false;
|
||||||
|
|
||||||
|
/* make sure no existing journal disk */
|
||||||
|
rdev_for_each(rdev2, mddev) {
|
||||||
|
if (test_bit(Journal, &rdev2->flags)) {
|
||||||
|
has_journal = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (has_journal) {
|
||||||
|
export_rdev(rdev);
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
set_bit(Journal, &rdev->flags);
|
set_bit(Journal, &rdev->flags);
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* check whether the device shows up in other nodes
|
* check whether the device shows up in other nodes
|
||||||
*/
|
*/
|
||||||
|
@ -8181,19 +8198,20 @@ static int remove_and_add_spares(struct mddev *mddev,
|
||||||
continue;
|
continue;
|
||||||
if (test_bit(Faulty, &rdev->flags))
|
if (test_bit(Faulty, &rdev->flags))
|
||||||
continue;
|
continue;
|
||||||
if (test_bit(Journal, &rdev->flags))
|
if (!test_bit(Journal, &rdev->flags)) {
|
||||||
continue;
|
if (mddev->ro &&
|
||||||
if (mddev->ro &&
|
! (rdev->saved_raid_disk >= 0 &&
|
||||||
! (rdev->saved_raid_disk >= 0 &&
|
!test_bit(Bitmap_sync, &rdev->flags)))
|
||||||
!test_bit(Bitmap_sync, &rdev->flags)))
|
continue;
|
||||||
continue;
|
|
||||||
|
|
||||||
rdev->recovery_offset = 0;
|
rdev->recovery_offset = 0;
|
||||||
|
}
|
||||||
if (mddev->pers->
|
if (mddev->pers->
|
||||||
hot_add_disk(mddev, rdev) == 0) {
|
hot_add_disk(mddev, rdev) == 0) {
|
||||||
if (sysfs_link_rdev(mddev, rdev))
|
if (sysfs_link_rdev(mddev, rdev))
|
||||||
/* failure here is OK */;
|
/* failure here is OK */;
|
||||||
spares++;
|
if (!test_bit(Journal, &rdev->flags))
|
||||||
|
spares++;
|
||||||
md_new_event(mddev);
|
md_new_event(mddev);
|
||||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
||||||
}
|
}
|
||||||
|
|
|
@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state)
|
||||||
|
|
||||||
bool r5l_log_disk_error(struct r5conf *conf)
|
bool r5l_log_disk_error(struct r5conf *conf)
|
||||||
{
|
{
|
||||||
|
struct r5l_log *log;
|
||||||
|
bool ret;
|
||||||
/* don't allow write if journal disk is missing */
|
/* don't allow write if journal disk is missing */
|
||||||
if (!conf->log)
|
rcu_read_lock();
|
||||||
return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
|
log = rcu_dereference(conf->log);
|
||||||
return test_bit(Faulty, &conf->log->rdev->flags);
|
|
||||||
|
if (!log)
|
||||||
|
ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
|
||||||
|
else
|
||||||
|
ret = test_bit(Faulty, &log->rdev->flags);
|
||||||
|
rcu_read_unlock();
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct r5l_recovery_ctx {
|
struct r5l_recovery_ctx {
|
||||||
|
@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
|
||||||
if (r5l_load_log(log))
|
if (r5l_load_log(log))
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
conf->log = log;
|
rcu_assign_pointer(conf->log, log);
|
||||||
return 0;
|
return 0;
|
||||||
error:
|
error:
|
||||||
md_unregister_thread(&log->reclaim_thread);
|
md_unregister_thread(&log->reclaim_thread);
|
||||||
|
|
|
@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||||
struct disk_info *p = conf->disks + number;
|
struct disk_info *p = conf->disks + number;
|
||||||
|
|
||||||
print_raid5_conf(conf);
|
print_raid5_conf(conf);
|
||||||
if (test_bit(Journal, &rdev->flags)) {
|
if (test_bit(Journal, &rdev->flags) && conf->log) {
|
||||||
|
struct r5l_log *log;
|
||||||
/*
|
/*
|
||||||
* journal disk is not removable, but we need give a chance to
|
* we can't wait pending write here, as this is called in
|
||||||
* update superblock of other disks. Otherwise journal disk
|
* raid5d, wait will deadlock.
|
||||||
* will be considered as 'fresh'
|
|
||||||
*/
|
*/
|
||||||
set_bit(MD_CHANGE_DEVS, &mddev->flags);
|
if (atomic_read(&mddev->writes_pending))
|
||||||
return -EINVAL;
|
return -EBUSY;
|
||||||
|
log = conf->log;
|
||||||
|
conf->log = NULL;
|
||||||
|
synchronize_rcu();
|
||||||
|
r5l_exit_log(log);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
if (rdev == p->rdev)
|
if (rdev == p->rdev)
|
||||||
rdevp = &p->rdev;
|
rdevp = &p->rdev;
|
||||||
|
@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||||
int first = 0;
|
int first = 0;
|
||||||
int last = conf->raid_disks - 1;
|
int last = conf->raid_disks - 1;
|
||||||
|
|
||||||
if (test_bit(Journal, &rdev->flags))
|
if (test_bit(Journal, &rdev->flags)) {
|
||||||
return -EINVAL;
|
char b[BDEVNAME_SIZE];
|
||||||
|
if (conf->log)
|
||||||
|
return -EBUSY;
|
||||||
|
|
||||||
|
rdev->raid_disk = 0;
|
||||||
|
/*
|
||||||
|
* The array is in readonly mode if journal is missing, so no
|
||||||
|
* write requests running. We should be safe
|
||||||
|
*/
|
||||||
|
r5l_init_log(conf, rdev);
|
||||||
|
printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
|
||||||
|
mdname(mddev), bdevname(rdev->bdev, b));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
if (mddev->recovery_disabled == conf->recovery_disabled)
|
if (mddev->recovery_disabled == conf->recovery_disabled)
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue