md/raid6: move the spare page to a percpu allocation
In preparation for asynchronous handling of raid6 operations, move the spare page to a percpu allocation to allow multiple simultaneous synchronous raid6 recovery operations. Make this allocation cpu-hotplug aware to maximize allocation efficiency.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
commit 36d1c6476b
parent a11034b428
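
The diff below touches several places at once, so here is the core idea in isolation: keep one scratch page per CPU, allocate the pages up front, and look the page up under get_cpu()/put_cpu() so preemption is disabled while it is in use. This is a minimal sketch, not code from the patch: the demo_* names are invented for illustration, and it simplifies by allocating for every possible CPU, whereas the patch allocates for present CPUs and relies on a hotplug notifier to cover CPUs that come online later.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/smp.h>

/* Illustrative stand-in for the patch's struct raid5_percpu. */
struct demo_percpu {
        struct page *spare_page;        /* per-CPU scratch page */
};

static struct demo_percpu *demo_pcpu;

static int demo_alloc(void)
{
        unsigned long cpu;

        demo_pcpu = alloc_percpu(struct demo_percpu);
        if (!demo_pcpu)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct page *page = alloc_page(GFP_KERNEL);

                if (!page)
                        return -ENOMEM; /* a real caller would unwind here */
                per_cpu_ptr(demo_pcpu, cpu)->spare_page = page;
        }
        return 0;
}

static void demo_check(void)
{
        unsigned long cpu = get_cpu();  /* disables preemption */
        struct page *tmp = per_cpu_ptr(demo_pcpu, cpu)->spare_page;

        /* ... use tmp as scratch, e.g. hold a recomputed Q block ... */
        (void)tmp;

        put_cpu();                      /* re-enables preemption */
}

Because each CPU owns its own spare page, parity checks for different stripes can use scratch space concurrently on different CPUs; the old single conf->spare_page forced them to take turns, and the new code needs only get_cpu()/put_cpu() rather than any locking around the page.
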
@@ -48,6 +48,7 @@
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
 #include <linux/seq_file.h>
+#include <linux/cpu.h>
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
@@ -2565,14 +2566,15 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
 
 
 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
                                 struct stripe_head_state *s,
-                                struct r6_state *r6s, struct page *tmp_page,
-                                int disks)
+                                struct r6_state *r6s, int disks)
 {
         int update_p = 0, update_q = 0;
         struct r5dev *dev;
         int pd_idx = sh->pd_idx;
         int qd_idx = sh->qd_idx;
+        unsigned long cpu;
+        struct page *tmp_page;
 
         set_bit(STRIPE_HANDLE, &sh->state);
 
@@ -2583,78 +2585,75 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
          * case we can only check one of them, possibly using the
          * other to generate missing data
          */
-
-        /* If !tmp_page, we cannot do the calculations,
-         * but as we have set STRIPE_HANDLE, we will soon be called
-         * by stripe_handle with a tmp_page - just wait until then.
-         */
-        if (tmp_page) {
-                if (s->failed == r6s->q_failed) {
-                        /* The only possible failed device holds 'Q', so it
-                         * makes sense to check P (If anything else were failed,
-                         * we would have used P to recreate it).
-                         */
-                        compute_block_1(sh, pd_idx, 1);
-                        if (!page_is_zero(sh->dev[pd_idx].page)) {
-                                compute_block_1(sh, pd_idx, 0);
-                                update_p = 1;
-                        }
-                }
-                if (!r6s->q_failed && s->failed < 2) {
-                        /* q is not failed, and we didn't use it to generate
-                         * anything, so it makes sense to check it
-                         */
-                        memcpy(page_address(tmp_page),
-                                page_address(sh->dev[qd_idx].page),
-                                STRIPE_SIZE);
-                        compute_parity6(sh, UPDATE_PARITY);
-                        if (memcmp(page_address(tmp_page),
-                                page_address(sh->dev[qd_idx].page),
-                                STRIPE_SIZE) != 0) {
-                                clear_bit(STRIPE_INSYNC, &sh->state);
-                                update_q = 1;
-                        }
-                }
-                if (update_p || update_q) {
-                        conf->mddev->resync_mismatches += STRIPE_SECTORS;
-                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-                                /* don't try to repair!! */
-                                update_p = update_q = 0;
-                }
+        cpu = get_cpu();
+        tmp_page = per_cpu_ptr(conf->percpu, cpu)->spare_page;
+        if (s->failed == r6s->q_failed) {
+                /* The only possible failed device holds 'Q', so it
+                 * makes sense to check P (If anything else were failed,
+                 * we would have used P to recreate it).
+                 */
+                compute_block_1(sh, pd_idx, 1);
+                if (!page_is_zero(sh->dev[pd_idx].page)) {
+                        compute_block_1(sh, pd_idx, 0);
+                        update_p = 1;
+                }
+        }
+        if (!r6s->q_failed && s->failed < 2) {
+                /* q is not failed, and we didn't use it to generate
+                 * anything, so it makes sense to check it
+                 */
+                memcpy(page_address(tmp_page),
+                        page_address(sh->dev[qd_idx].page),
+                        STRIPE_SIZE);
+                compute_parity6(sh, UPDATE_PARITY);
+                if (memcmp(page_address(tmp_page),
+                        page_address(sh->dev[qd_idx].page),
+                        STRIPE_SIZE) != 0) {
+                        clear_bit(STRIPE_INSYNC, &sh->state);
+                        update_q = 1;
+                }
+        }
+        put_cpu();
 
-                /* now write out any block on a failed drive,
-                 * or P or Q if they need it
-                 */
+        if (update_p || update_q) {
+                conf->mddev->resync_mismatches += STRIPE_SECTORS;
+                if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+                        /* don't try to repair!! */
+                        update_p = update_q = 0;
+        }
 
-                if (s->failed == 2) {
-                        dev = &sh->dev[r6s->failed_num[1]];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
-                if (s->failed >= 1) {
-                        dev = &sh->dev[r6s->failed_num[0]];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
+        /* now write out any block on a failed drive,
+         * or P or Q if they need it
+         */
 
-                if (update_p) {
-                        dev = &sh->dev[pd_idx];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
-                if (update_q) {
-                        dev = &sh->dev[qd_idx];
-                        s->locked++;
-                        set_bit(R5_LOCKED, &dev->flags);
-                        set_bit(R5_Wantwrite, &dev->flags);
-                }
-                clear_bit(STRIPE_DEGRADED, &sh->state);
+        if (s->failed == 2) {
+                dev = &sh->dev[r6s->failed_num[1]];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
+        }
+        if (s->failed >= 1) {
+                dev = &sh->dev[r6s->failed_num[0]];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
+        }
 
-                set_bit(STRIPE_INSYNC, &sh->state);
-        }
+        if (update_p) {
+                dev = &sh->dev[pd_idx];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
+        }
+        if (update_q) {
+                dev = &sh->dev[qd_idx];
+                s->locked++;
+                set_bit(R5_LOCKED, &dev->flags);
+                set_bit(R5_Wantwrite, &dev->flags);
+        }
+        clear_bit(STRIPE_DEGRADED, &sh->state);
+
+        set_bit(STRIPE_INSYNC, &sh->state);
 }
 
 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
@@ -3009,7 +3008,7 @@ static bool handle_stripe5(struct stripe_head *sh)
         return blocked_rdev == NULL;
 }
 
-static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe6(struct stripe_head *sh)
 {
         raid5_conf_t *conf = sh->raid_conf;
         int disks = sh->disks;
@@ -3164,7 +3163,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
          * data is available
          */
         if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state))
-                handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks);
+                handle_parity_checks6(conf, sh, &s, &r6s, disks);
 
         if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
                 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
@@ -3247,16 +3246,14 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 }
 
 /* returns true if the stripe was handled */
-static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page)
+static bool handle_stripe(struct stripe_head *sh)
 {
         if (sh->raid_conf->level == 6)
-                return handle_stripe6(sh, tmp_page);
+                return handle_stripe6(sh);
         else
                 return handle_stripe5(sh);
 }
 
-
-
 static void raid5_activate_delayed(raid5_conf_t *conf)
 {
         if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
@@ -4047,7 +4044,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         spin_unlock(&sh->lock);
 
         /* wait for any blocked device to be handled */
-        while(unlikely(!handle_stripe(sh, NULL)))
+        while (unlikely(!handle_stripe(sh)))
                 ;
         release_stripe(sh);
 
@@ -4104,7 +4101,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                         return handled;
                 }
 
-                handle_stripe(sh, NULL);
+                handle_stripe(sh);
                 release_stripe(sh);
                 handled++;
         }
@@ -4168,7 +4165,7 @@ static void raid5d(mddev_t *mddev)
                 spin_unlock_irq(&conf->device_lock);
 
                 handled++;
-                handle_stripe(sh, conf->spare_page);
+                handle_stripe(sh);
                 release_stripe(sh);
 
                 spin_lock_irq(&conf->device_lock);
@@ -4309,15 +4306,104 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
         return sectors * (raid_disks - conf->max_degraded);
 }
 
+static void raid5_free_percpu(raid5_conf_t *conf)
+{
+        struct raid5_percpu *percpu;
+        unsigned long cpu;
+
+        if (!conf->percpu)
+                return;
+
+        get_online_cpus();
+        for_each_possible_cpu(cpu) {
+                percpu = per_cpu_ptr(conf->percpu, cpu);
+                safe_put_page(percpu->spare_page);
+        }
+#ifdef CONFIG_HOTPLUG_CPU
+        unregister_cpu_notifier(&conf->cpu_notify);
+#endif
+        put_online_cpus();
+
+        free_percpu(conf->percpu);
+}
+
 static void free_conf(raid5_conf_t *conf)
 {
         shrink_stripes(conf);
-        safe_put_page(conf->spare_page);
+        raid5_free_percpu(conf);
         kfree(conf->disks);
         kfree(conf->stripe_hashtbl);
         kfree(conf);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
+                              void *hcpu)
+{
+        raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
+        long cpu = (long)hcpu;
+        struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
+
+        switch (action) {
+        case CPU_UP_PREPARE:
+        case CPU_UP_PREPARE_FROZEN:
+                if (!percpu->spare_page)
+                        percpu->spare_page = alloc_page(GFP_KERNEL);
+                if (!percpu->spare_page) {
+                        pr_err("%s: failed memory allocation for cpu%ld\n",
+                               __func__, cpu);
+                        return NOTIFY_BAD;
+                }
+                break;
+        case CPU_DEAD:
+        case CPU_DEAD_FROZEN:
+                safe_put_page(percpu->spare_page);
+                percpu->spare_page = NULL;
+                break;
+        default:
+                break;
+        }
+        return NOTIFY_OK;
+}
+#endif
+
+static int raid5_alloc_percpu(raid5_conf_t *conf)
+{
+        unsigned long cpu;
+        struct page *spare_page;
+        struct raid5_percpu *allcpus;
+        int err;
+
+        /* the only percpu data is the raid6 spare page */
+        if (conf->level != 6)
+                return 0;
+
+        allcpus = alloc_percpu(struct raid5_percpu);
+        if (!allcpus)
+                return -ENOMEM;
+        conf->percpu = allcpus;
+
+        get_online_cpus();
+        err = 0;
+        for_each_present_cpu(cpu) {
+                spare_page = alloc_page(GFP_KERNEL);
+                if (!spare_page) {
+                        err = -ENOMEM;
+                        break;
+                }
+                per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
+        }
+#ifdef CONFIG_HOTPLUG_CPU
+        conf->cpu_notify.notifier_call = raid456_cpu_notify;
+        conf->cpu_notify.priority = 0;
+        if (err == 0)
+                err = register_cpu_notifier(&conf->cpu_notify);
+#endif
+        put_online_cpus();
+
+        return err;
+}
+
 static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
         raid5_conf_t *conf;
@@ -4372,11 +4458,10 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
         if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
                 goto abort;
 
-        if (mddev->new_level == 6) {
-                conf->spare_page = alloc_page(GFP_KERNEL);
-                if (!conf->spare_page)
-                        goto abort;
-        }
+        conf->level = mddev->new_level;
+        if (raid5_alloc_percpu(conf) != 0)
+                goto abort;
+
         spin_lock_init(&conf->device_lock);
         init_waitqueue_head(&conf->wait_for_stripe);
         init_waitqueue_head(&conf->wait_for_overlap);
@@ -4412,7 +4497,6 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
         }
 
         conf->chunk_size = mddev->new_chunk;
-        conf->level = mddev->new_level;
         if (conf->level == 6)
                 conf->max_degraded = 2;
         else
@@ -383,8 +383,13 @@ struct raid5_private_data {
                                             * (fresh device added).
                                             * Cleared when a sync completes.
                                             */
-
-        struct page             *spare_page; /* Used when checking P/Q in raid6 */
+        /* per cpu variables */
+        struct raid5_percpu {
+                struct page     *spare_page; /* Used when checking P/Q in raid6 */
+        } *percpu;
+#ifdef CONFIG_HOTPLUG_CPU
+        struct notifier_block   cpu_notify;
+#endif
 
         /*
          * Free stripes pool