[PATCH] md: support md/linear array with components greater than 2 terabytes.
linear currently uses division by the size of the smallest component device to find which device a request goes to. If that smallest device is larger than 2 terabytes, then the division will not work on some systems. So we introduce a pre-shift, and take care not to make the hash table too large, much like the code in raid0. Also get rid of conf->nr_zones, which is not needed. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
parent
4b6d287f62
commit
15945fee6f
|
@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
|
||||||
/*
|
/*
|
||||||
* sector_div(a,b) returns the remainder and sets a to a/b
|
* sector_div(a,b) returns the remainder and sets a to a/b
|
||||||
*/
|
*/
|
||||||
(void)sector_div(block, conf->smallest->size);
|
block >>= conf->preshift;
|
||||||
|
(void)sector_div(block, conf->hash_spacing);
|
||||||
hash = conf->hash_table[block];
|
hash = conf->hash_table[block];
|
||||||
|
|
||||||
while ((sector>>1) >= (hash->size + hash->offset))
|
while ((sector>>1) >= (hash->size + hash->offset))
|
||||||
|
@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* linear_mergeable_bvec -- tell bio layer if a two requests can be merged
|
* linear_mergeable_bvec -- tell bio layer if two requests can be merged
|
||||||
* @q: request queue
|
* @q: request queue
|
||||||
* @bio: the buffer head that's been built up so far
|
* @bio: the buffer head that's been built up so far
|
||||||
* @biovec: the request that could be merged to it.
|
* @biovec: the request that could be merged to it.
|
||||||
|
@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev)
|
||||||
dev_info_t **table;
|
dev_info_t **table;
|
||||||
mdk_rdev_t *rdev;
|
mdk_rdev_t *rdev;
|
||||||
int i, nb_zone, cnt;
|
int i, nb_zone, cnt;
|
||||||
sector_t start;
|
sector_t min_spacing;
|
||||||
sector_t curr_offset;
|
sector_t curr_offset;
|
||||||
struct list_head *tmp;
|
struct list_head *tmp;
|
||||||
|
|
||||||
|
@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev)
|
||||||
memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
|
memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
|
||||||
mddev->private = conf;
|
mddev->private = conf;
|
||||||
|
|
||||||
/*
|
|
||||||
* Find the smallest device.
|
|
||||||
*/
|
|
||||||
|
|
||||||
conf->smallest = NULL;
|
|
||||||
cnt = 0;
|
cnt = 0;
|
||||||
mddev->array_size = 0;
|
mddev->array_size = 0;
|
||||||
|
|
||||||
|
@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev)
|
||||||
disk->size = rdev->size;
|
disk->size = rdev->size;
|
||||||
mddev->array_size += rdev->size;
|
mddev->array_size += rdev->size;
|
||||||
|
|
||||||
if (!conf->smallest || (disk->size < conf->smallest->size))
|
|
||||||
conf->smallest = disk;
|
|
||||||
cnt++;
|
cnt++;
|
||||||
}
|
}
|
||||||
if (cnt != mddev->raid_disks) {
|
if (cnt != mddev->raid_disks) {
|
||||||
|
@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev)
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
min_spacing = mddev->array_size;
|
||||||
|
sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
|
||||||
|
|
||||||
|
/* min_spacing is the minimum spacing that will fit the hash
|
||||||
|
* table in one PAGE. This may be much smaller than needed.
|
||||||
|
* We find the smallest non-terminal set of consecutive devices
|
||||||
|
* that is larger than min_spacing and use the size of that as
|
||||||
|
* the actual spacing
|
||||||
|
*/
|
||||||
|
conf->hash_spacing = mddev->array_size;
|
||||||
|
for (i=0; i < cnt-1 ; i++) {
|
||||||
|
sector_t sz = 0;
|
||||||
|
int j;
|
||||||
|
for (j=i; i<cnt-1 && sz < min_spacing ; j++)
|
||||||
|
sz += conf->disks[j].size;
|
||||||
|
if (sz >= min_spacing && sz < conf->hash_spacing)
|
||||||
|
conf->hash_spacing = sz;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* hash_spacing may be too large for sector_div to work with,
|
||||||
|
* so we might need to pre-shift
|
||||||
|
*/
|
||||||
|
conf->preshift = 0;
|
||||||
|
if (sizeof(sector_t) > sizeof(u32)) {
|
||||||
|
sector_t space = conf->hash_spacing;
|
||||||
|
while (space > (sector_t)(~(u32)0)) {
|
||||||
|
space >>= 1;
|
||||||
|
conf->preshift++;
|
||||||
|
}
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* This code was restructured to work around a gcc-2.95.3 internal
|
* This code was restructured to work around a gcc-2.95.3 internal
|
||||||
* compiler error. Alter it with care.
|
* compiler error. Alter it with care.
|
||||||
|
@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev)
|
||||||
unsigned round;
|
unsigned round;
|
||||||
unsigned long base;
|
unsigned long base;
|
||||||
|
|
||||||
sz = mddev->array_size;
|
sz = mddev->array_size >> conf->preshift;
|
||||||
base = conf->smallest->size;
|
sz += 1; /* force round-up */
|
||||||
|
base = conf->hash_spacing >> conf->preshift;
|
||||||
round = sector_div(sz, base);
|
round = sector_div(sz, base);
|
||||||
nb_zone = conf->nr_zones = sz + (round ? 1 : 0);
|
nb_zone = sz + (round ? 1 : 0);
|
||||||
}
|
}
|
||||||
|
BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
|
||||||
|
|
||||||
conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone,
|
conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
|
||||||
GFP_KERNEL);
|
GFP_KERNEL);
|
||||||
if (!conf->hash_table)
|
if (!conf->hash_table)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Here we generate the linear hash table
|
* Here we generate the linear hash table
|
||||||
|
* First calculate the device offsets.
|
||||||
*/
|
*/
|
||||||
|
conf->disks[0].offset = 0;
|
||||||
|
for (i=1; i<mddev->raid_disks; i++)
|
||||||
|
conf->disks[i].offset =
|
||||||
|
conf->disks[i-1].offset +
|
||||||
|
conf->disks[i-1].size;
|
||||||
|
|
||||||
table = conf->hash_table;
|
table = conf->hash_table;
|
||||||
start = 0;
|
|
||||||
curr_offset = 0;
|
curr_offset = 0;
|
||||||
for (i = 0; i < cnt; i++) {
|
i = 0;
|
||||||
dev_info_t *disk = conf->disks + i;
|
for (curr_offset = 0;
|
||||||
|
curr_offset < mddev->array_size;
|
||||||
|
curr_offset += conf->hash_spacing) {
|
||||||
|
|
||||||
disk->offset = curr_offset;
|
while (i < mddev->raid_disks-1 &&
|
||||||
curr_offset += disk->size;
|
curr_offset >= conf->disks[i+1].offset)
|
||||||
|
i++;
|
||||||
|
|
||||||
/* 'curr_offset' is the end of this disk
|
*table ++ = conf->disks + i;
|
||||||
* 'start' is the start of table
|
|
||||||
*/
|
|
||||||
while (start < curr_offset) {
|
|
||||||
*table++ = disk;
|
|
||||||
start += conf->smallest->size;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (table-conf->hash_table != nb_zone)
|
|
||||||
BUG();
|
if (conf->preshift) {
|
||||||
|
conf->hash_spacing >>= conf->preshift;
|
||||||
|
/* round hash_spacing up so that when we divide by it,
|
||||||
|
* we err on the side of "too-low", which is safest.
|
||||||
|
*/
|
||||||
|
conf->hash_spacing++;
|
||||||
|
}
|
||||||
|
|
||||||
|
BUG_ON(table - conf->hash_table > nb_zone);
|
||||||
|
|
||||||
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
|
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
|
||||||
mddev->queue->unplug_fn = linear_unplug;
|
mddev->queue->unplug_fn = linear_unplug;
|
||||||
|
@ -299,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
|
||||||
sector_t s = 0;
|
sector_t s = 0;
|
||||||
|
|
||||||
seq_printf(seq, " ");
|
seq_printf(seq, " ");
|
||||||
for (j = 0; j < conf->nr_zones; j++)
|
for (j = 0; j < mddev->raid_disks; j++)
|
||||||
{
|
{
|
||||||
char b[BDEVNAME_SIZE];
|
char b[BDEVNAME_SIZE];
|
||||||
s += conf->smallest_size;
|
s += conf->smallest_size;
|
||||||
|
|
|
@ -14,8 +14,8 @@ typedef struct dev_info dev_info_t;
|
||||||
struct linear_private_data
|
struct linear_private_data
|
||||||
{
|
{
|
||||||
dev_info_t **hash_table;
|
dev_info_t **hash_table;
|
||||||
dev_info_t *smallest;
|
sector_t hash_spacing;
|
||||||
int nr_zones;
|
int preshift; /* shift before dividing by hash_spacing */
|
||||||
dev_info_t disks[0];
|
dev_info_t disks[0];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue