Distributed Spare (dRAID) Feature

This patch adds a new top-level vdev type called dRAID, which stands
for Distributed parity RAID.  This pool configuration allows all dRAID
vdevs to participate when rebuilding to a distributed hot spare device.
This can substantially reduce the total time required to restore full
parity to a pool with a failed device.

A dRAID pool can be created using the new top-level `draid` type.
Like `raidz`, the desired redundancy is specified after the type:
`draid[1,2,3]`.  No additional information is required to create the
pool and reasonable default values will be chosen based on the number
of child vdevs in the dRAID vdev.

    zpool create <pool> draid[1,2,3] <vdevs...>

Unlike raidz, additional optional dRAID configuration values can be
provided as part of the draid type as colon separated values. This
allows administrators to fully specify a layout for either performance
or capacity reasons.  The supported options include:

    zpool create <pool> \
        draid[<parity>][:<data>d][:<children>c][:<spares>s] \
        <vdevs...>

    - draid[parity]       - Parity level (default 1)
    - draid[:<data>d]     - Data devices per group (default 8)
    - draid[:<children>c] - Expected number of child vdevs
    - draid[:<spares>s]   - Distributed hot spares (default 0)

Abbreviated example `zpool status` output for a 68 disk dRAID pool
with two distributed spares using special allocation classes.

```
  pool: tank
 state: ONLINE
config:

    NAME                  STATE     READ WRITE CKSUM
    tank                  ONLINE       0     0     0
      draid2:8d:68c:2s-0  ONLINE       0     0     0
        L0                ONLINE       0     0     0
        L1                ONLINE       0     0     0
        ...
        U25               ONLINE       0     0     0
        U26               ONLINE       0     0     0
        spare-53          ONLINE       0     0     0
          U27             ONLINE       0     0     0
          draid2-0-0      ONLINE       0     0     0
        U28               ONLINE       0     0     0
        U29               ONLINE       0     0     0
        ...
        U42               ONLINE       0     0     0
        U43               ONLINE       0     0     0
    special
      mirror-1            ONLINE       0     0     0
        L5                ONLINE       0     0     0
        U5                ONLINE       0     0     0
      mirror-2            ONLINE       0     0     0
        L6                ONLINE       0     0     0
        U6                ONLINE       0     0     0
    spares
      draid2-0-0          INUSE     currently in use
      draid2-0-1          AVAIL
```

When adding test coverage for the new dRAID vdev type the following
options were added to the ztest command.  These options are leveraged
by zloop.sh to test a wide range of dRAID configurations.

    -K draid|raidz|random - kind of RAID to test
    -D <value>            - dRAID data drives per group
    -S <value>            - dRAID distributed hot spares
    -R <value>            - RAID parity (raidz or dRAID)

The zpool_create, zpool_import, redundancy, replacement and fault
test groups have all been updated to provide test coverage for the
dRAID feature.

Co-authored-by: Isaac Huang <he.huang@intel.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Co-authored-by: Don Brady <don.brady@delphix.com>
Co-authored-by: Matthew Ahrens <mahrens@delphix.com>
Co-authored-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Mark Maybee <mmaybee@cray.com>
Reviewed-by: Matt Ahrens <matt@delphix.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10102
This commit is contained in:
Brian Behlendorf 2020-11-13 13:51:51 -08:00 committed by GitHub
parent a724db0374
commit b2255edcc0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
153 changed files with 10203 additions and 1882 deletions

View File

@ -83,8 +83,17 @@ run_gen_bench_impl(const char *impl)
/* create suitable raidz_map */
ncols = rto_opts.rto_dcols + fn + 1;
zio_bench.io_size = 1ULL << ds;
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, fn+1);
if (rto_opts.rto_expand) {
rm_bench = vdev_raidz_map_alloc_expanded(
zio_bench.io_abd,
zio_bench.io_size, zio_bench.io_offset,
rto_opts.rto_ashift, ncols+1, ncols,
fn+1, rto_opts.rto_expand_offset);
} else {
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, fn+1);
}
/* estimate iteration count */
iter_cnt = GEN_BENCH_MEMORY;
@ -163,8 +172,16 @@ run_rec_bench_impl(const char *impl)
(1ULL << BENCH_ASHIFT))
continue;
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, PARITY_PQR);
if (rto_opts.rto_expand) {
rm_bench = vdev_raidz_map_alloc_expanded(
zio_bench.io_abd,
zio_bench.io_size, zio_bench.io_offset,
BENCH_ASHIFT, ncols+1, ncols,
PARITY_PQR, rto_opts.rto_expand_offset);
} else {
rm_bench = vdev_raidz_map_alloc(&zio_bench,
BENCH_ASHIFT, ncols, PARITY_PQR);
}
/* estimate iteration count */
iter_cnt = (REC_BENCH_MEMORY);

View File

@ -77,16 +77,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force)
(void) fprintf(stdout, DBLSEP "Running with options:\n"
" (-a) zio ashift : %zu\n"
" (-o) zio offset : 1 << %zu\n"
" (-e) expanded map : %s\n"
" (-r) reflow offset : %llx\n"
" (-d) number of raidz data columns : %zu\n"
" (-s) size of DATA : 1 << %zu\n"
" (-S) sweep parameters : %s \n"
" (-v) verbose : %s \n\n",
opts->rto_ashift, /* -a */
ilog2(opts->rto_offset), /* -o */
opts->rto_dcols, /* -d */
ilog2(opts->rto_dsize), /* -s */
opts->rto_sweep ? "yes" : "no", /* -S */
verbose); /* -v */
opts->rto_ashift, /* -a */
ilog2(opts->rto_offset), /* -o */
opts->rto_expand ? "yes" : "no", /* -e */
(u_longlong_t)opts->rto_expand_offset, /* -r */
opts->rto_dcols, /* -d */
ilog2(opts->rto_dsize), /* -s */
opts->rto_sweep ? "yes" : "no", /* -S */
verbose); /* -v */
}
}
@ -104,6 +108,8 @@ static void usage(boolean_t requested)
"\t[-S parameter sweep (default: %s)]\n"
"\t[-t timeout for parameter sweep test]\n"
"\t[-B benchmark all raidz implementations]\n"
"\t[-e use expanded raidz map (default: %s)]\n"
"\t[-r expanded raidz map reflow offset (default: %llx)]\n"
"\t[-v increase verbosity (default: %zu)]\n"
"\t[-h (print help)]\n"
"\t[-T test the test, see if failure would be detected]\n"
@ -114,6 +120,8 @@ static void usage(boolean_t requested)
o->rto_dcols, /* -d */
ilog2(o->rto_dsize), /* -s */
rto_opts.rto_sweep ? "yes" : "no", /* -S */
rto_opts.rto_expand ? "yes" : "no", /* -e */
(u_longlong_t)o->rto_expand_offset, /* -r */
o->rto_v); /* -d */
exit(requested ? 0 : 1);
@ -128,7 +136,7 @@ static void process_options(int argc, char **argv)
bcopy(&rto_opts_defaults, o, sizeof (*o));
while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) {
while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) {
value = 0;
switch (opt) {
@ -136,6 +144,12 @@ static void process_options(int argc, char **argv)
value = strtoull(optarg, NULL, 0);
o->rto_ashift = MIN(13, MAX(9, value));
break;
case 'e':
o->rto_expand = 1;
break;
case 'r':
o->rto_expand_offset = strtoull(optarg, NULL, 0);
break;
case 'o':
value = strtoull(optarg, NULL, 0);
o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
@ -179,25 +193,34 @@ static void process_options(int argc, char **argv)
}
}
#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)
#define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd)
#define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size)
#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
#define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd)
#define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size)
static int
cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
{
int i, ret = 0;
int r, i, ret = 0;
VERIFY(parity >= 1 && parity <= 3);
for (i = 0; i < parity; i++) {
if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
!= 0) {
ret++;
LOG_OPT(D_DEBUG, opts,
"\nParity block [%d] different!\n", i);
for (r = 0; r < rm->rm_nrows; r++) {
raidz_row_t * const rr = rm->rm_row[r];
raidz_row_t * const rrg = opts->rm_golden->rm_row[r];
for (i = 0; i < parity; i++) {
if (CODE_COL_SIZE(rrg, i) == 0) {
VERIFY0(CODE_COL_SIZE(rr, i));
continue;
}
if (abd_cmp(CODE_COL(rr, i),
CODE_COL(rrg, i)) != 0) {
ret++;
LOG_OPT(D_DEBUG, opts,
"\nParity block [%d] different!\n", i);
}
}
}
return (ret);
@ -206,16 +229,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
static int
cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
{
int i, ret = 0;
int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);
int r, i, dcols, ret = 0;
for (i = 0; i < dcols; i++) {
if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
!= 0) {
ret++;
for (r = 0; r < rm->rm_nrows; r++) {
raidz_row_t *rr = rm->rm_row[r];
raidz_row_t *rrg = opts->rm_golden->rm_row[r];
dcols = opts->rm_golden->rm_row[0]->rr_cols -
raidz_parity(opts->rm_golden);
for (i = 0; i < dcols; i++) {
if (DATA_COL_SIZE(rrg, i) == 0) {
VERIFY0(DATA_COL_SIZE(rr, i));
continue;
}
LOG_OPT(D_DEBUG, opts,
"\nData block [%d] different!\n", i);
if (abd_cmp(DATA_COL(rrg, i),
DATA_COL(rr, i)) != 0) {
ret++;
LOG_OPT(D_DEBUG, opts,
"\nData block [%d] different!\n", i);
}
}
}
return (ret);
@ -236,12 +269,13 @@ init_rand(void *data, size_t size, void *private)
static void
corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
{
int i;
raidz_col_t *col;
for (i = 0; i < cnt; i++) {
col = &rm->rm_col[tgts[i]];
abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
for (int r = 0; r < rm->rm_nrows; r++) {
raidz_row_t *rr = rm->rm_row[r];
for (int i = 0; i < cnt; i++) {
raidz_col_t *col = &rr->rr_col[tgts[i]];
abd_iterate_func(col->rc_abd, 0, col->rc_size,
init_rand, NULL);
}
}
}
@ -288,10 +322,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
VERIFY0(vdev_raidz_impl_set("original"));
opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
opts->rto_ashift, total_ncols, parity);
rm_test = vdev_raidz_map_alloc(zio_test,
opts->rto_ashift, total_ncols, parity);
if (opts->rto_expand) {
opts->rm_golden =
vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
opts->zio_golden->io_size, opts->zio_golden->io_offset,
opts->rto_ashift, total_ncols+1, total_ncols,
parity, opts->rto_expand_offset);
rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
zio_test->io_size, zio_test->io_offset,
opts->rto_ashift, total_ncols+1, total_ncols,
parity, opts->rto_expand_offset);
} else {
opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
opts->rto_ashift, total_ncols, parity);
rm_test = vdev_raidz_map_alloc(zio_test,
opts->rto_ashift, total_ncols, parity);
}
VERIFY(opts->zio_golden);
VERIFY(opts->rm_golden);
@ -312,6 +358,188 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
return (err);
}
/*
* Allocate a multi-row raidz_map_t describing how an I/O of `size` bytes
* at `offset` is split into rows and columns across the children of a
* raidz vdev that may be in the middle of an expansion (reflow).
*
* abd           - buffer holding the I/O's data; every non-empty data
*                 column references one sector of it via abd_get_offset()
* size          - I/O size in bytes
* offset        - byte offset of the I/O on the raidz (parent) vdev
* ashift        - log2 of the sector size
* physical_cols - child count used for rows that lie entirely before
*                 reflow_offset; all other rows use physical_cols - 1
* logical_cols  - stripe width (data + parity) used to split the I/O
*                 into rows
* nparity       - number of parity columns at the start of every row
* reflow_offset - reflow boundary in bytes (compared in sectors after
*                 shifting by ashift)
*
* Returns the newly allocated map with rm_nrows rows populated and
* rm_ops set from vdev_raidz_math_get_ops().
*
* If reflow is not in progress, reflow_offset should be UINT64_MAX.
* For each row, if the row is entirely before reflow_offset, it will
* come from the new location. Otherwise this row will come from the
* old location. Therefore, rows that straddle the reflow_offset will
* come from the old location.
*
* NOTE: Until raidz expansion is implemented this function is only
* needed by raidz_test.c to test the multi-row raidz_map_t functionality.
*/
raidz_map_t *
vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
uint64_t nparity, uint64_t reflow_offset)
{
/* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = size >> ashift;
uint64_t q, r, bc, devidx, asize = 0, tot;
/*
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
* AKA "full rows"
*/
q = s / (logical_cols - nparity);
/*
* "Remainder": The number of partial stripe data sectors in this I/O.
* This will add a sector to some, but not all, child vdevs.
*/
r = s - q * (logical_cols - nparity);
/* The number of "big columns" - those which contain remainder data. */
bc = (r == 0 ? 0 : r + nparity);
/*
* The total number of data and parity sectors associated with
* this I/O.
*/
tot = s + nparity * (q + (r == 0 ? 0 : 1));
/* How many rows contain data (not skip) */
uint64_t rows = howmany(tot, logical_cols);
/* Every row is allocated with the same column count (see below). */
int cols = MIN(tot, logical_cols);
raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
KM_SLEEP);
rm->rm_nrows = rows;
for (uint64_t row = 0; row < rows; row++) {
/* Not zeroed: every field used below is set explicitly. */
raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
rr_col[cols]), KM_SLEEP);
rm->rm_row[row] = rr;
/* The starting RAIDZ (parent) vdev sector of the row. */
uint64_t b = (offset >> ashift) + row * logical_cols;
/*
* If we are in the middle of a reflow, and any part of this
* row has not been copied, then use the old location of
* this row.
*/
int row_phys_cols = physical_cols;
if (b + (logical_cols - nparity) > reflow_offset >> ashift)
row_phys_cols--;
/* starting child of this row */
uint64_t child_id = b % row_phys_cols;
/* The starting byte offset on each child vdev. */
uint64_t child_offset = (b / row_phys_cols) << ashift;
/*
* We set cols to the entire width of the block, even
* if this row is shorter. This is needed because parity
* generation (for Q and R) needs to know the entire width,
* because it treats the short row as though it was
* full-width (and the "phantom" sectors were zero-filled).
*
* Another approach to this would be to set cols shorter
* (to just the number of columns that we might do i/o to)
* and have another mechanism to tell the parity generation
* about the "entire width". Reconstruction (at least
* vdev_raidz_reconstruct_general()) would also need to
* know about the "entire width".
*/
rr->rr_cols = cols;
rr->rr_bigcols = bc;
rr->rr_missingdata = 0;
rr->rr_missingparity = 0;
rr->rr_firstdatacol = nparity;
rr->rr_abd_copy = NULL;
rr->rr_abd_empty = NULL;
rr->rr_nempty = 0;
for (int c = 0; c < rr->rr_cols; c++, child_id++) {
/* Wrap to the first child and advance one sector. */
if (child_id >= row_phys_cols) {
child_id -= row_phys_cols;
child_offset += 1ULL << ashift;
}
rr->rr_col[c].rc_devidx = child_id;
rr->rr_col[c].rc_offset = child_offset;
rr->rr_col[c].rc_gdata = NULL;
rr->rr_col[c].rc_orig_data = NULL;
rr->rr_col[c].rc_error = 0;
rr->rr_col[c].rc_tried = 0;
rr->rr_col[c].rc_skipped = 0;
rr->rr_col[c].rc_need_orig_restore = B_FALSE;
/*
* Index of this column among the data columns. Only
* used in the data-column branch below, where
* c >= rr_firstdatacol.
*/
uint64_t dc = c - rr->rr_firstdatacol;
if (c < rr->rr_firstdatacol) {
/* Parity column: gets its own one-sector buffer. */
rr->rr_col[c].rc_size = 1ULL << ashift;
rr->rr_col[c].rc_abd =
abd_alloc_linear(rr->rr_col[c].rc_size,
B_TRUE);
} else if (row == rows - 1 && bc != 0 && c >= bc) {
/*
* Past the end; this is for parity generation.
*/
rr->rr_col[c].rc_size = 0;
rr->rr_col[c].rc_abd = NULL;
} else {
/*
* "data column" (col excluding parity)
* TODO: add an ASCII art diagram here
*
* "off" is the sector index into abd: big
* columns hold `rows` sectors each, the
* remaining columns hold `rows - 1`.
*/
uint64_t off;
if (c < bc || r == 0) {
off = dc * rows + row;
} else {
off = r * rows +
(dc - r) * (rows - 1) + row;
}
rr->rr_col[c].rc_size = 1ULL << ashift;
rr->rr_col[c].rc_abd =
abd_get_offset(abd, off << ashift);
}
asize += rr->rr_col[c].rc_size;
}
/*
* If all data stored spans all columns, there's a danger that
* parity will always be on the same device and, since parity
* isn't read during normal operation, that that device's I/O
* bandwidth won't be used effectively. We therefore switch
* the parity every 1MB.
*
* ...at least that was, ostensibly, the theory. As a practical
* matter unless we juggle the parity between all devices
* evenly, we won't see any benefit. Further, occasional writes
* that aren't a multiple of the LCM of the number of children
* and the minimum stripe width are sufficient to avoid pessimal
* behavior. Unfortunately, this decision created an implicit
* on-disk format requirement that we need to support for all
* eternity, but only for single-parity RAID-Z.
*
* If we intend to skip a sector in the zeroth column for
* padding we must make sure to note this swap. We will never
* intend to skip the first column since at least one data and
* one parity column must appear in each row.
*/
if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
(offset & (1ULL << 20))) {
ASSERT(rr->rr_cols >= 2);
ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
/* Swap the device and offset of columns 0 and 1. */
devidx = rr->rr_col[0].rc_devidx;
uint64_t o = rr->rr_col[0].rc_offset;
rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
rr->rr_col[1].rc_devidx = devidx;
rr->rr_col[1].rc_offset = o;
}
}
/* Every data and parity sector must have been assigned a column. */
ASSERT3U(asize, ==, tot << ashift);
/* init RAIDZ parity ops */
rm->rm_ops = vdev_raidz_math_get_ops();
return (rm);
}
static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
@ -330,8 +558,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
(*zio)->io_abd = raidz_alloc(alloc_dsize);
init_zio_abd(*zio);
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
total_ncols, parity);
if (opts->rto_expand) {
rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
(*zio)->io_size, (*zio)->io_offset,
opts->rto_ashift, total_ncols+1, total_ncols,
parity, opts->rto_expand_offset);
} else {
rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
total_ncols, parity);
}
VERIFY(rm);
/* Make sure code columns are destroyed */
@ -420,7 +655,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
if (fn < RAIDZ_REC_PQ) {
/* can reconstruct 1 failed data disk */
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
if (x0 >= rm->rm_cols - raidz_parity(rm))
if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
continue;
/* Check if should stop */
@ -445,10 +680,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
} else if (fn < RAIDZ_REC_PQR) {
/* can reconstruct 2 failed data disk */
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
if (x0 >= rm->rm_cols - raidz_parity(rm))
if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
continue;
for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
if (x1 >= rm->rm_cols - raidz_parity(rm))
if (x1 >= rm->rm_row[0]->rr_cols -
raidz_parity(rm))
continue;
/* Check if should stop */
@ -475,14 +711,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
} else {
/* can reconstruct 3 failed data disk */
for (x0 = 0; x0 < opts->rto_dcols; x0++) {
if (x0 >= rm->rm_cols - raidz_parity(rm))
if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm))
continue;
for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
if (x1 >= rm->rm_cols - raidz_parity(rm))
if (x1 >= rm->rm_row[0]->rr_cols -
raidz_parity(rm))
continue;
for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
if (x2 >=
rm->rm_cols - raidz_parity(rm))
if (x2 >= rm->rm_row[0]->rr_cols -
raidz_parity(rm))
continue;
/* Check if should stop */
@ -700,6 +937,8 @@ run_sweep(void)
opts->rto_dcols = dcols_v[d];
opts->rto_offset = (1 << ashift_v[a]) * rand();
opts->rto_dsize = size_v[s];
opts->rto_expand = rto_opts.rto_expand;
opts->rto_expand_offset = rto_opts.rto_expand_offset;
opts->rto_v = 0; /* be quiet */
VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
@ -732,6 +971,7 @@ exit:
return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
}
int
main(int argc, char **argv)
{

View File

@ -44,13 +44,15 @@ static const char *raidz_impl_names[] = {
typedef struct raidz_test_opts {
size_t rto_ashift;
size_t rto_offset;
uint64_t rto_offset;
size_t rto_dcols;
size_t rto_dsize;
size_t rto_v;
size_t rto_sweep;
size_t rto_sweep_timeout;
size_t rto_benchmark;
size_t rto_expand;
uint64_t rto_expand_offset;
size_t rto_sanity;
size_t rto_gdb;
@ -69,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = {
.rto_v = 0,
.rto_sweep = 0,
.rto_benchmark = 0,
.rto_expand = 0,
.rto_expand_offset = -1ULL,
.rto_sanity = 0,
.rto_gdb = 0,
.rto_should_stop = B_FALSE
@ -113,4 +117,7 @@ void init_zio_abd(zio_t *zio);
void run_raidz_benchmark(void);
struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
#endif /* RAIDZ_TEST_H */

View File

@ -1642,7 +1642,11 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
if (vd->vdev_ops == &vdev_draid_ops)
ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
else
ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);
dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
@ -5203,8 +5207,6 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;
abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
cv_broadcast(&spa->spa_scrub_io_cv);
@ -5231,6 +5233,8 @@ zdb_blkptr_done(zio_t *zio)
blkbuf);
}
mutex_exit(&spa->spa_scrub_lock);
abd_free(zio->io_abd);
}
static int

View File

@ -435,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
return;
}
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
/*
* Prefer sequential resilvering when supported (mirrors and dRAID),
* otherwise fall back to a traditional healing resilver.
*/
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE);
if (ret != 0) {
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot,
B_TRUE, B_FALSE);
}
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
fullpath, path, (ret == 0) ? "no errors" :

View File

@ -219,12 +219,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
* replace it.
*/
for (s = 0; s < nspares; s++) {
char *spare_name;
boolean_t rebuild = B_FALSE;
char *spare_name, *type;
if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
&spare_name) != 0)
continue;
/* prefer sequential resilvering for distributed spares */
if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE,
&type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
rebuild = B_TRUE;
/* if set, add the "ashift" pool property to the spare nvlist */
if (source != ZPROP_SRC_DEFAULT)
(void) nvlist_add_uint64(spares[s],
@ -237,7 +243,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
dev_name, basename(spare_name));
if (zpool_vdev_attach(zhp, dev_name, spare_name,
replacement, B_TRUE, B_FALSE) == 0) {
replacement, B_TRUE, rebuild) == 0) {
free(dev_name);
nvlist_free(replacement);
return (B_TRUE);
@ -499,6 +505,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
* Attempt to substitute a hot spare.
*/
(void) replace_with_spare(hdl, zhp, vdev);
zpool_close(zhp);
}

View File

@ -892,6 +892,107 @@ usage:
return (-1);
}
/*
 * Return a default volblocksize for the pool which always uses more than
 * half of the data sectors.  This primarily applies to dRAID which always
 * writes full stripe widths.
 *
 * zhp   - pool handle whose vdev tree is inspected to find the largest
 *         minimum allocation size among the top-level vdevs
 * props - dataset properties; when "volblocksize" is already present it is
 *         returned as-is (after printing a warning to stderr if it is
 *         smaller than recommended), otherwise the computed target size is
 *         added to the nvlist
 *
 * Returns the volblocksize to use.  If the pool's vdev tree cannot be
 * looked up, ZVOL_DEFAULT_BLOCKSIZE is returned and props is not modified.
 */
static uint64_t
default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
{
	uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
	nvlist_t *tree, **vdevs;
	uint_t nvdevs;

	nvlist_t *config = zpool_get_config(zhp, NULL);

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
	    nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
	    &vdevs, &nvdevs) != 0) {
		return (ZVOL_DEFAULT_BLOCKSIZE);
	}

	/* The index is unsigned to match the type of nvdevs. */
	for (uint_t i = 0; i < nvdevs; i++) {
		nvlist_t *nv = vdevs[i];
		uint64_t ashift, ndata, nparity;

		/* Top-level vdevs without an ashift are ignored. */
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
			continue;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA,
		    &ndata) == 0) {
			/* dRAID minimum allocation width */
			asize = MAX(asize, ndata * (1ULL << ashift));
		} else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			/* raidz minimum allocation width */
			if (nparity == 1)
				asize = MAX(asize, 2 * (1ULL << ashift));
			else
				asize = MAX(asize, 4 * (1ULL << ashift));
		} else {
			/* mirror or (non-redundant) leaf vdev */
			asize = MAX(asize, 1ULL << ashift);
		}
	}

	/*
	 * Calculate the target volblocksize such that more than half
	 * of the asize is used. The following table is for 4k sectors.
	 *
	 * n   asize   blksz  used  |   n   asize   blksz  used
	 * -------------------------+---------------------------------
	 * 1   4,096   8,192  100%  |   9  36,864  32,768   88%
	 * 2   8,192   8,192  100%  |  10  40,960  32,768   80%
	 * 3  12,288   8,192   66%  |  11  45,056  32,768   72%
	 * 4  16,384  16,384  100%  |  12  49,152  32,768   66%
	 * 5  20,480  16,384   80%  |  13  53,248  32,768   61%
	 * 6  24,576  16,384   66%  |  14  57,344  32,768   57%
	 * 7  28,672  16,384   57%  |  15  61,440  32,768   53%
	 * 8  32,768  32,768  100%  |  16  65,536  65,536  100%
	 *
	 * This is primarily a concern for dRAID which always allocates
	 * a full stripe width.  For dRAID the default stripe width is
	 * n=8 in which case the volblocksize is set to 32k.  Ignoring
	 * compression there are no unused sectors.  This same reasoning
	 * applies to raidz[2,3] so target 4 sectors to minimize waste.
	 */
	uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
	while (tgt_volblocksize * 2 <= asize)
		tgt_volblocksize *= 2;

	const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
	if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) {
		/* Issue a warning when a non-optimal size is requested. */
		if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) {
			(void) fprintf(stderr, gettext("Warning: "
			    "volblocksize (%llu) is less than the default "
			    "minimum block size (%llu).\nTo reduce wasted "
			    "space a volblocksize of %llu is recommended.\n"),
			    (u_longlong_t)volblocksize,
			    (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE,
			    (u_longlong_t)tgt_volblocksize);
		} else if (volblocksize < tgt_volblocksize) {
			/*
			 * Here volblocksize < tgt_volblocksize <= asize, so
			 * the subtraction below cannot underflow.
			 */
			(void) fprintf(stderr, gettext("Warning: "
			    "volblocksize (%llu) is much less than the "
			    "minimum allocation\nunit (%llu), which wastes "
			    "at least %llu%% of space. To reduce wasted "
			    "space,\nuse a larger volblocksize (%llu is "
			    "recommended), fewer dRAID data disks\n"
			    "per group, or smaller sector size (ashift).\n"),
			    (u_longlong_t)volblocksize, (u_longlong_t)asize,
			    (u_longlong_t)((100 * (asize - volblocksize)) /
			    asize), (u_longlong_t)tgt_volblocksize);
		}
	} else {
		/* Nothing requested: record the computed default in props. */
		volblocksize = tgt_volblocksize;
		fnvlist_add_uint64(props, prop, volblocksize);
	}

	return (volblocksize);
}
/*
* zfs create [-Pnpv] [-o prop=value] ... fs
* zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size
@ -932,6 +1033,7 @@ zfs_do_create(int argc, char **argv)
int ret = 1;
nvlist_t *props;
uint64_t intval;
char *strval;
if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
nomem();
@ -1018,7 +1120,7 @@ zfs_do_create(int argc, char **argv)
goto badusage;
}
if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) {
if (dryrun || type == ZFS_TYPE_VOLUME) {
char msg[ZFS_MAX_DATASET_NAME_LEN * 2];
char *p;
@ -1040,18 +1142,24 @@ zfs_do_create(int argc, char **argv)
}
}
/*
* if volsize is not a multiple of volblocksize, round it up to the
* nearest multiple of the volblocksize
*/
if (type == ZFS_TYPE_VOLUME) {
uint64_t volblocksize;
const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE);
uint64_t volblocksize = default_volblocksize(zpool_handle,
real_props);
if (nvlist_lookup_uint64(props,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
&volblocksize) != 0)
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE &&
nvlist_lookup_string(props, prop, &strval) != 0) {
if (asprintf(&strval, "%llu",
(u_longlong_t)volblocksize) == -1)
nomem();
nvlist_add_string(props, prop, strval);
free(strval);
}
/*
* If volsize is not a multiple of volblocksize, round it
* up to the nearest multiple of the volblocksize.
*/
if (volsize % volblocksize) {
volsize = P2ROUNDUP_TYPED(volsize, volblocksize,
uint64_t);
@ -1064,11 +1172,9 @@ zfs_do_create(int argc, char **argv)
}
}
if (type == ZFS_TYPE_VOLUME && !noreserve) {
uint64_t spa_version;
zfs_prop_t resv_prop;
char *strval;
spa_version = zpool_get_prop_int(zpool_handle,
ZPOOL_PROP_VERSION, NULL);

View File

@ -2294,7 +2294,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
}
}
/* Display vdev initialization and trim status for leaves */
/* Display vdev initialization and trim status for leaves. */
if (children == 0) {
print_status_initialize(vs, cb->cb_print_vdev_init);
print_status_trim(vs, cb->cb_print_vdev_trim);
@ -9849,7 +9849,8 @@ vdev_any_spare_replacing(nvlist_t *nv)
(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type);
if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 ||
strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) {
strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 ||
strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) {
return (B_TRUE);
}

View File

@ -86,9 +86,6 @@
boolean_t error_seen;
boolean_t is_force;
/*PRINTFLIKE1*/
void
vdev_error(const char *fmt, ...)
@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path)
uint_t i, nspares;
boolean_t inuse;
if (zpool_is_draid_spare(path))
return (B_TRUE);
if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
return (B_FALSE);
@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path)
* /dev/xxx Complete disk path
* /xxx Full path to file
* xxx Shorthand for <zfs_vdev_paths>/xxx
* draid* Virtual dRAID spare
*/
static nvlist_t *
make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
{
char path[MAXPATHLEN];
struct stat64 statbuf;
@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
/* After whole disk check restore original passed path */
strlcpy(path, arg, sizeof (path));
} else if (zpool_is_draid_spare(arg)) {
if (!is_primary) {
(void) fprintf(stderr,
gettext("cannot open '%s': dRAID spares can only "
"be used to replace primary vdevs\n"), arg);
return (NULL);
}
wholedisk = B_TRUE;
strlcpy(path, arg, sizeof (path));
type = VDEV_TYPE_DRAID_SPARE;
} else {
err = is_shorthand_path(arg, path, sizeof (path),
&statbuf, &wholedisk);
@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
}
}
/*
* Determine whether this is a device or a file.
*/
if (wholedisk || S_ISBLK(statbuf.st_mode)) {
type = VDEV_TYPE_DISK;
} else if (S_ISREG(statbuf.st_mode)) {
type = VDEV_TYPE_FILE;
} else {
(void) fprintf(stderr, gettext("cannot use '%s': must be a "
"block device or regular file\n"), path);
return (NULL);
if (type == NULL) {
/*
* Determine whether this is a device or a file.
*/
if (wholedisk || S_ISBLK(statbuf.st_mode)) {
type = VDEV_TYPE_DISK;
} else if (S_ISREG(statbuf.st_mode)) {
type = VDEV_TYPE_FILE;
} else {
fprintf(stderr, gettext("cannot use '%s': must "
"be a block device or regular file\n"), path);
return (NULL);
}
}
/*
@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
if (is_log)
verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
VDEV_ALLOC_BIAS_LOG) == 0);
if (strcmp(type, VDEV_TYPE_DISK) == 0)
verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
(uint64_t)wholedisk) == 0);
@ -432,11 +443,16 @@ typedef struct replication_level {
#define ZPOOL_FUZZ (16 * 1024 * 1024)
/*
* N.B. For the purposes of comparing replication levels dRAID can be
* considered functionally equivalent to raidz.
*/
static boolean_t
is_raidz_mirror(replication_level_t *a, replication_level_t *b,
replication_level_t **raidz, replication_level_t **mirror)
{
if (strcmp(a->zprl_type, "raidz") == 0 &&
if ((strcmp(a->zprl_type, "raidz") == 0 ||
strcmp(a->zprl_type, "draid") == 0) &&
strcmp(b->zprl_type, "mirror") == 0) {
*raidz = a;
*mirror = b;
@ -445,6 +461,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b,
return (B_FALSE);
}
/*
 * Returns B_TRUE when both replication levels are of type "raidz" or
 * "draid", in any combination and in either order; B_FALSE otherwise.
 */
static boolean_t
is_raidz_draid(replication_level_t *a, replication_level_t *b)
{
	/* The first level must be raidz or draid ... */
	if (strcmp(a->zprl_type, "raidz") != 0 &&
	    strcmp(a->zprl_type, "draid") != 0)
		return (B_FALSE);

	/* ... and so must the second. */
	if (strcmp(b->zprl_type, "raidz") != 0 &&
	    strcmp(b->zprl_type, "draid") != 0)
		return (B_FALSE);

	return (B_TRUE);
}
/*
* Given a list of toplevel vdevs, return the current replication level. If
* the config is inconsistent, then NULL is returned. If 'fatal' is set, then
@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
rep.zprl_type = type;
rep.zprl_children = 0;
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
strcmp(type, VDEV_TYPE_DRAID) == 0) {
verify(nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_NPARITY,
&rep.zprl_parity) == 0);
@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
else
return (NULL);
}
} else if (is_raidz_draid(&lastrep, &rep)) {
/*
* Accepted raidz and draid when they can
* handle the same number of disk failures.
*/
if (lastrep.zprl_parity != rep.zprl_parity) {
if (ret != NULL)
free(ret);
ret = NULL;
if (fatal)
vdev_error(gettext(
"mismatched replication "
"level: %s and %s vdevs "
"with different "
"redundancy, %llu vs. "
"%llu are present\n"),
lastrep.zprl_type,
rep.zprl_type,
lastrep.zprl_parity,
rep.zprl_parity);
else
return (NULL);
}
} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
0) {
if (ret != NULL)
@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
return (anyinuse);
}
/*
* Returns the parity level extracted from a raidz or draid type.
* If the parity cannot be determined zero is returned.
*/
static int
get_parity(const char *type)
{
long parity = 0;
const char *p;
if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {
p = type + strlen(VDEV_TYPE_RAIDZ);
if (*p == '\0') {
/* when unspecified default to single parity */
return (1);
} else if (*p == '0') {
/* no zero prefixes allowed */
return (0);
} else {
/* 0-3, no suffixes allowed */
char *end;
errno = 0;
parity = strtol(p, &end, 10);
if (errno != 0 || *end != '\0' ||
parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) {
return (0);
}
}
} else if (strncmp(type, VDEV_TYPE_DRAID,
strlen(VDEV_TYPE_DRAID)) == 0) {
p = type + strlen(VDEV_TYPE_DRAID);
if (*p == '\0' || *p == ':') {
/* when unspecified default to single parity */
return (1);
} else if (*p == '0') {
/* no zero prefixes allowed */
return (0);
} else {
/* 0-3, allowed suffixes: '\0' or ':' */
char *end;
errno = 0;
parity = strtol(p, &end, 10);
if (errno != 0 ||
parity < 1 || parity > VDEV_DRAID_MAXPARITY ||
(*end != '\0' && *end != ':')) {
return (0);
}
}
}
return ((int)parity);
}
/*
* Assign the minimum and maximum number of devices allowed for
* the specified type. On error NULL is returned, otherwise the
* type prefix is returned (raidz, mirror, etc).
*/
static const char *
is_grouping(const char *type, int *mindev, int *maxdev)
{
if (strncmp(type, "raidz", 5) == 0) {
const char *p = type + 5;
char *end;
long nparity;
if (*p == '\0') {
nparity = 1;
} else if (*p == '0') {
return (NULL); /* no zero prefixes allowed */
} else {
errno = 0;
nparity = strtol(p, &end, 10);
if (errno != 0 || nparity < 1 || nparity >= 255 ||
*end != '\0')
return (NULL);
}
int nparity;
if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {
nparity = get_parity(type);
if (nparity == 0)
return (NULL);
if (mindev != NULL)
*mindev = nparity + 1;
if (maxdev != NULL)
*maxdev = 255;
return (VDEV_TYPE_RAIDZ);
if (strncmp(type, VDEV_TYPE_RAIDZ,
strlen(VDEV_TYPE_RAIDZ)) == 0) {
return (VDEV_TYPE_RAIDZ);
} else {
return (VDEV_TYPE_DRAID);
}
}
if (maxdev != NULL)
@ -1167,6 +1279,163 @@ is_grouping(const char *type, int *mindev, int *maxdev)
return (NULL);
}
/*
 * Extract the configuration parameters encoded in the dRAID type and
 * use them to generate a dRAID configuration.  The expected format is:
 *
 * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
 *
 * The intent is to be able to generate a good configuration when no
 * additional information is provided.  The only mandatory component
 * of the 'type' is the 'draid' prefix.  If a value is not provided
 * then reasonable defaults are used.  The optional components may
 * appear in any order but the d/s/c suffix is required.
 *
 * Valid inputs:
 * - data: number of data devices per group (1-255)
 * - parity: number of parity blocks per group (1-3)
 * - spares: number of distributed spares (0-100)
 * - children: total number of devices (1-255)
 *
 * Examples:
 * - zpool create tank draid <devices...>
 * - zpool create tank draid2:8d:51c:2s <devices...>
 *
 * 'children' is the number of child vdevs which were actually provided.
 * On success the parity, data, spare and group counts are added to 'nv'
 * and zero is returned.  On any failure EINVAL is returned and nothing
 * is added to 'nv'.
 */
static int
draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
{
	uint64_t nparity;
	uint64_t nspares = 0;
	uint64_t ndata = UINT64_MAX;
	uint64_t ngroups = 1;
	uint64_t reserved, avail;
	const char *p;

	if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
		return (EINVAL);

	nparity = (uint64_t)get_parity(type);
	if (nparity == 0)
		return (EINVAL);

	/* Walk each ':' separated <number><c|d|s> component. */
	p = type;
	while ((p = strchr(p, ':')) != NULL) {
		char *end;
		long value;
		int suffix;

		p = p + 1;
		errno = 0;

		/* ctype functions require an unsigned char value */
		if (!isdigit((unsigned char)p[0])) {
			(void) fprintf(stderr, gettext("invalid dRAID "
			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
			    type);
			return (EINVAL);
		}

		/* Expected non-zero value with c/d/s suffix */
		value = strtol(p, &end, 10);
		suffix = tolower((unsigned char)*end);
		if (errno != 0 ||
		    (suffix != 'c' && suffix != 'd' && suffix != 's')) {
			(void) fprintf(stderr, gettext("invalid dRAID "
			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
			    type);
			return (EINVAL);
		}

		switch (suffix) {
		case 'c':
			if ((uint64_t)value != children) {
				fprintf(stderr,
				    gettext("invalid number of dRAID children; "
				    "%llu required but %llu provided\n"),
				    (u_longlong_t)value,
				    (u_longlong_t)children);
				return (EINVAL);
			}
			break;
		case 'd':
			ndata = (uint64_t)value;
			break;
		case 's':
			nspares = (uint64_t)value;
			break;
		default:
			verify(0); /* Unreachable */
			break;
		}
	}

	/*
	 * Number of children left over for data once the spares and parity
	 * have been set aside.  This is clamped at zero rather than being
	 * computed as (children - nspares - nparity), which would wrap
	 * around for unsigned values when too many spares are requested.
	 */
	reserved = nspares + nparity;
	avail = (children > reserved) ? children - reserved : 0;

	/*
	 * When a specific number of data disks is not provided limit a
	 * redundancy group to 8 data disks.  This value was selected to
	 * provide a reasonable tradeoff between capacity and performance.
	 */
	if (ndata == UINT64_MAX) {
		if (avail > 0) {
			ndata = MIN(avail, 8);
		} else {
			fprintf(stderr, gettext("request number of "
			    "distributed spares %llu and parity level %llu\n"
			    "leaves no disks available for data\n"),
			    (u_longlong_t)nspares, (u_longlong_t)nparity);
			return (EINVAL);
		}
	}

	/* Verify the maximum allowed group size is never exceeded. */
	if (ndata == 0 || ndata > avail) {
		fprintf(stderr, gettext("requested number of dRAID data "
		    "disks per group %llu is too high,\nat most %llu disks "
		    "are available for data\n"), (u_longlong_t)ndata,
		    (u_longlong_t)avail);
		return (EINVAL);
	}

	if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
		fprintf(stderr,
		    gettext("invalid dRAID parity level %llu; must be "
		    "between 1 and %d\n"), (u_longlong_t)nparity,
		    VDEV_DRAID_MAXPARITY);
		return (EINVAL);
	}

	/*
	 * Verify the requested number of spares can be satisfied.
	 * An arbitrary limit of 100 distributed spares is applied.
	 * The subtraction cannot wrap: the group size check above
	 * guarantees ndata + nparity + nspares <= children.
	 */
	if (nspares > 100 || nspares > (children - (ndata + nparity))) {
		fprintf(stderr,
		    gettext("invalid number of dRAID spares %llu; additional "
		    "disks would be required\n"), (u_longlong_t)nspares);
		return (EINVAL);
	}

	/* Verify the requested number children is sufficient. */
	if (children < (ndata + nparity + nspares)) {
		fprintf(stderr, gettext("%llu disks were provided, but at "
		    "least %llu disks are required for this config\n"),
		    (u_longlong_t)children,
		    (u_longlong_t)(ndata + nparity + nspares));
		return (EINVAL);
	}

	if (children > VDEV_DRAID_MAX_CHILDREN) {
		fprintf(stderr, gettext("%llu disks were provided, but "
		    "dRAID only supports up to %u disks\n"),
		    (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
		return (EINVAL);
	}

	/*
	 * Calculate the minimum number of groups required to fill a slice.
	 * This is the LCM of the stripe width (ndata + nparity) and the
	 * number of data drives (children - nspares).  The checks above
	 * ensure (children - nspares) is at least the stripe width, so
	 * the modulus is never zero.
	 */
	while ((ngroups * (ndata + nparity)) % (children - nspares) != 0)
		ngroups++;

	/* Store the basic dRAID configuration. */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);

	return (0);
}
/*
* Construct a syntactically valid vdev specification,
* and ensure that all devices and files exist and can be opened.
@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
{
nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
const char *type;
uint64_t is_log, is_special, is_dedup;
const char *type, *fulltype;
boolean_t is_log, is_special, is_dedup, is_spare;
boolean_t seen_logs;
top = NULL;
@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv)
nspares = 0;
nlogs = 0;
nl2cache = 0;
is_log = is_special = is_dedup = B_FALSE;
is_log = is_special = is_dedup = is_spare = B_FALSE;
seen_logs = B_FALSE;
nvroot = NULL;
while (argc > 0) {
fulltype = argv[0];
nv = NULL;
/*
* If it's a mirror or raidz, the subsequent arguments are
* its leaves -- until we encounter the next mirror or raidz.
* If it's a mirror, raidz, or draid the subsequent arguments
* are its leaves -- until we encounter the next mirror,
* raidz or draid.
*/
if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
nvlist_t **child = NULL;
int c, children = 0;
@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
"specified only once\n"));
goto spec_out;
}
is_spare = B_TRUE;
is_log = is_special = is_dedup = B_FALSE;
}
@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
}
seen_logs = B_TRUE;
is_log = B_TRUE;
is_special = B_FALSE;
is_dedup = B_FALSE;
is_special = is_dedup = is_spare = B_FALSE;
argc--;
argv++;
/*
@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
is_special = B_TRUE;
is_log = B_FALSE;
is_dedup = B_FALSE;
is_log = is_dedup = is_spare = B_FALSE;
argc--;
argv++;
continue;
@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv)
if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
is_dedup = B_TRUE;
is_log = B_FALSE;
is_special = B_FALSE;
is_log = is_special = is_spare = B_FALSE;
argc--;
argv++;
continue;
@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv)
"specified only once\n"));
goto spec_out;
}
is_log = is_special = is_dedup = B_FALSE;
is_log = is_special = B_FALSE;
is_dedup = is_spare = B_FALSE;
}
if (is_log || is_special || is_dedup) {
@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
for (c = 1; c < argc; c++) {
if (is_grouping(argv[c], NULL, NULL) != NULL)
break;
children++;
child = realloc(child,
children * sizeof (nvlist_t *));
if (child == NULL)
zpool_no_memory();
if ((nv = make_leaf_vdev(props, argv[c],
B_FALSE)) == NULL) {
!(is_log || is_special || is_dedup ||
is_spare))) == NULL) {
for (c = 0; c < children - 1; c++)
nvlist_free(child[c]);
free(child);
@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv)
type) == 0);
verify(nvlist_add_uint64(nv,
ZPOOL_CONFIG_IS_LOG, is_log) == 0);
if (is_log)
if (is_log) {
verify(nvlist_add_string(nv,
ZPOOL_CONFIG_ALLOCATION_BIAS,
VDEV_ALLOC_BIAS_LOG) == 0);
}
if (is_special) {
verify(nvlist_add_string(nv,
ZPOOL_CONFIG_ALLOCATION_BIAS,
@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv)
ZPOOL_CONFIG_NPARITY,
mindev - 1) == 0);
}
if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
if (draid_config_by_type(nv,
fulltype, children) != 0) {
for (c = 0; c < children; c++)
nvlist_free(child[c]);
free(child);
goto spec_out;
}
}
verify(nvlist_add_nvlist_array(nv,
ZPOOL_CONFIG_CHILDREN, child,
children) == 0);
@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv)
* We have a device. Pass off to make_leaf_vdev() to
* construct the appropriate nvlist describing the vdev.
*/
if ((nv = make_leaf_vdev(props, argv[0],
is_log)) == NULL)
if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
is_special || is_dedup || is_spare))) == NULL)
goto spec_out;
if (is_log)
verify(nvlist_add_uint64(nv,
ZPOOL_CONFIG_IS_LOG, is_log) == 0);
if (is_log) {
verify(nvlist_add_string(nv,
ZPOOL_CONFIG_ALLOCATION_BIAS,
VDEV_ALLOC_BIAS_LOG) == 0);
nlogs++;
}
if (is_special) {
verify(nvlist_add_string(nv,
ZPOOL_CONFIG_ALLOCATION_BIAS,

View File

@ -104,6 +104,7 @@
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_draid.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
@ -167,8 +168,11 @@ typedef struct ztest_shared_opts {
size_t zo_vdev_size;
int zo_ashift;
int zo_mirrors;
int zo_raidz;
int zo_raidz_parity;
int zo_raid_children;
int zo_raid_parity;
char zo_raid_type[8];
int zo_draid_data;
int zo_draid_spares;
int zo_datasets;
int zo_threads;
uint64_t zo_passtime;
@ -191,9 +195,12 @@ static const ztest_shared_opts_t ztest_opts_defaults = {
.zo_vdevs = 5,
.zo_ashift = SPA_MINBLOCKSHIFT,
.zo_mirrors = 2,
.zo_raidz = 4,
.zo_raidz_parity = 1,
.zo_raid_children = 4,
.zo_raid_parity = 1,
.zo_raid_type = VDEV_TYPE_RAIDZ,
.zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
.zo_draid_data = 4, /* data drives */
.zo_draid_spares = 1, /* distributed spares */
.zo_datasets = 7,
.zo_threads = 23,
.zo_passtime = 60, /* 60 seconds */
@ -232,7 +239,7 @@ static ztest_shared_ds_t *ztest_shared_ds;
#define BT_MAGIC 0x123456789abcdefULL
#define MAXFAULTS(zs) \
(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1)
enum ztest_io_type {
ZTEST_IO_WRITE_TAG,
@ -689,8 +696,11 @@ usage(boolean_t requested)
"\t[-s size_of_each_vdev (default: %s)]\n"
"\t[-a alignment_shift (default: %d)] use 0 for random\n"
"\t[-m mirror_copies (default: %d)]\n"
"\t[-r raidz_disks (default: %d)]\n"
"\t[-R raidz_parity (default: %d)]\n"
"\t[-r raidz_disks / draid_disks (default: %d)]\n"
"\t[-R raid_parity (default: %d)]\n"
"\t[-K raid_kind (default: random)] raidz|draid|random\n"
"\t[-D draid_data (default: %d)] in config\n"
"\t[-S draid_spares (default: %d)]\n"
"\t[-d datasets (default: %d)]\n"
"\t[-t threads (default: %d)]\n"
"\t[-g gang_block_threshold (default: %s)]\n"
@ -716,8 +726,10 @@ usage(boolean_t requested)
nice_vdev_size, /* -s */
zo->zo_ashift, /* -a */
zo->zo_mirrors, /* -m */
zo->zo_raidz, /* -r */
zo->zo_raidz_parity, /* -R */
zo->zo_raid_children, /* -r */
zo->zo_raid_parity, /* -R */
zo->zo_draid_data, /* -D */
zo->zo_draid_spares, /* -S */
zo->zo_datasets, /* -d */
zo->zo_threads, /* -t */
nice_force_ganging, /* -g */
@ -731,6 +743,21 @@ usage(boolean_t requested)
exit(requested ? 0 : 1);
}
/*
 * Return a value in the range [0, range) derived from 8 bytes read from
 * the ztest_fd_rand descriptor.  A range of zero always yields zero and
 * performs no read.  A read which does not return the full 8 bytes is
 * reported through fatal().
 */
static uint64_t
ztest_random(uint64_t range)
{
	uint64_t bits;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range != 0) {
		if (read(ztest_fd_rand, &bits, sizeof (bits)) !=
		    sizeof (bits))
			fatal(1, "short read from /dev/urandom");

		/* reduce the raw sample into the requested range */
		return (bits % range);
	}

	return (0);
}
static void
ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
@ -780,11 +807,12 @@ process_options(int argc, char **argv)
int opt;
uint64_t value;
char altdir[MAXNAMELEN] = { 0 };
char raid_kind[8] = { "random" };
bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
while ((opt = getopt(argc, argv,
"v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
"v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
value = 0;
switch (opt) {
case 'v':
@ -793,6 +821,8 @@ process_options(int argc, char **argv)
case 'm':
case 'r':
case 'R':
case 'D':
case 'S':
case 'd':
case 't':
case 'g':
@ -817,10 +847,19 @@ process_options(int argc, char **argv)
zo->zo_mirrors = value;
break;
case 'r':
zo->zo_raidz = MAX(1, value);
zo->zo_raid_children = MAX(1, value);
break;
case 'R':
zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
zo->zo_raid_parity = MIN(MAX(value, 1), 3);
break;
case 'K':
(void) strlcpy(raid_kind, optarg, sizeof (raid_kind));
break;
case 'D':
zo->zo_draid_data = MAX(1, value);
break;
case 'S':
zo->zo_draid_spares = MAX(1, value);
break;
case 'd':
zo->zo_datasets = MAX(1, value);
@ -895,7 +934,54 @@ process_options(int argc, char **argv)
}
}
zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
/* When raid choice is 'random' add a draid pool 50% of the time */
if (strcmp(raid_kind, "random") == 0) {
(void) strlcpy(raid_kind, (ztest_random(2) == 0) ?
"draid" : "raidz", sizeof (raid_kind));
if (ztest_opts.zo_verbose >= 3)
(void) printf("choosing RAID type '%s'\n", raid_kind);
}
if (strcmp(raid_kind, "draid") == 0) {
uint64_t min_devsize;
/* With fewer disk use 256M, otherwise 128M is OK */
min_devsize = (ztest_opts.zo_raid_children < 16) ?
(256ULL << 20) : (128ULL << 20);
/* No top-level mirrors with dRAID for now */
zo->zo_mirrors = 0;
/* Use more appropriate defaults for dRAID */
if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs)
zo->zo_vdevs = 1;
if (zo->zo_raid_children ==
ztest_opts_defaults.zo_raid_children)
zo->zo_raid_children = 16;
if (zo->zo_ashift < 12)
zo->zo_ashift = 12;
if (zo->zo_vdev_size < min_devsize)
zo->zo_vdev_size = min_devsize;
if (zo->zo_draid_data + zo->zo_raid_parity >
zo->zo_raid_children - zo->zo_draid_spares) {
(void) fprintf(stderr, "error: too few draid "
"children (%d) for stripe width (%d)\n",
zo->zo_raid_children,
zo->zo_draid_data + zo->zo_raid_parity);
usage(B_FALSE);
}
(void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID,
sizeof (zo->zo_raid_type));
} else /* using raidz */ {
ASSERT0(strcmp(raid_kind, "raidz"));
zo->zo_raid_parity = MIN(zo->zo_raid_parity,
zo->zo_raid_children - 1);
}
zo->zo_vdevtime =
(zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
@ -966,22 +1052,6 @@ ztest_kill(ztest_shared_t *zs)
(void) kill(getpid(), SIGKILL);
}
/*
 * Return a value in the range [0, range) derived from 8 bytes read from
 * the ztest_fd_rand descriptor.  A range of zero always yields zero and
 * performs no read.  A read which does not return the full 8 bytes is
 * reported through fatal().
 */
static uint64_t
ztest_random(uint64_t range)
{
	uint64_t bits;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range != 0) {
		if (read(ztest_fd_rand, &bits, sizeof (bits)) !=
		    sizeof (bits))
			fatal(1, "short read from /dev/urandom");

		/* reduce the raw sample into the requested range */
		return (bits % range);
	}

	return (0);
}
/* ARGSUSED */
static void
ztest_record_enospc(const char *s)
@ -997,12 +1067,27 @@ ztest_get_ashift(void)
return (ztest_opts.zo_ashift);
}
/*
 * Report whether 'name' looks like a distributed spare, i.e. it starts
 * with VDEV_TYPE_DRAID immediately followed by three unsigned decimal
 * fields separated by '-' (for example "draid2-0-0").  The fields are
 * scanned as parity, vdev id and spare id; their values are discarded
 * and anything following the third field is not examined.
 */
static boolean_t
ztest_is_draid_spare(const char *name)
{
	uint64_t parity = 0, vdev_id = 0, spare_id = 0;
	int matched;

	matched = sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu",
	    (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id,
	    (u_longlong_t *)&spare_id);

	/* all three fields must have been converted */
	return (matched == 3 ? B_TRUE : B_FALSE);
}
static nvlist_t *
make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
{
char *pathbuf;
uint64_t vdev;
nvlist_t *file;
boolean_t draid_spare = B_FALSE;
pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
@ -1024,9 +1109,11 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
ztest_dev_template, ztest_opts.zo_dir,
pool == NULL ? ztest_opts.zo_pool : pool, vdev);
}
} else {
draid_spare = ztest_is_draid_spare(path);
}
if (size != 0) {
if (size != 0 && !draid_spare) {
int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
if (fd == -1)
fatal(1, "can't open %s", path);
@ -1035,20 +1122,21 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
(void) close(fd);
}
VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
VERIFY0(nvlist_alloc(&file, NV_UNIQUE_NAME, 0));
VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_TYPE,
draid_spare ? VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE));
VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path));
VERIFY0(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift));
umem_free(pathbuf, MAXPATHLEN);
return (file);
}
static nvlist_t *
make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
make_vdev_raid(char *path, char *aux, char *pool, size_t size,
uint64_t ashift, int r)
{
nvlist_t *raidz, **child;
nvlist_t *raid, **child;
int c;
if (r < 2)
@ -1058,20 +1146,41 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
for (c = 0; c < r; c++)
child[c] = make_vdev_file(path, aux, pool, size, ashift);
VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
VDEV_TYPE_RAIDZ) == 0);
VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
ztest_opts.zo_raidz_parity) == 0);
VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
child, r) == 0);
VERIFY0(nvlist_alloc(&raid, NV_UNIQUE_NAME, 0));
VERIFY0(nvlist_add_string(raid, ZPOOL_CONFIG_TYPE,
ztest_opts.zo_raid_type));
VERIFY0(nvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY,
ztest_opts.zo_raid_parity));
VERIFY0(nvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN,
child, r));
if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) {
uint64_t ndata = ztest_opts.zo_draid_data;
uint64_t nparity = ztest_opts.zo_raid_parity;
uint64_t nspares = ztest_opts.zo_draid_spares;
uint64_t children = ztest_opts.zo_raid_children;
uint64_t ngroups = 1;
/*
* Calculate the minimum number of groups required to fill a
* slice. This is the LCM of the stripe width (data + parity)
* and the number of data drives (children - spares).
*/
while (ngroups * (ndata + nparity) % (children - nspares) != 0)
ngroups++;
/* Store the basic dRAID configuration. */
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata);
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
}
for (c = 0; c < r; c++)
nvlist_free(child[c]);
umem_free(child, r * sizeof (nvlist_t *));
return (raidz);
return (raid);
}
static nvlist_t *
@ -1082,12 +1191,12 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
int c;
if (m < 1)
return (make_vdev_raidz(path, aux, pool, size, ashift, r));
return (make_vdev_raid(path, aux, pool, size, ashift, r));
child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
for (c = 0; c < m; c++)
child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
child[c] = make_vdev_raid(path, aux, pool, size, ashift, r);
VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
@ -2809,6 +2918,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
if (ztest_opts.zo_mmp_test)
return;
/* dRAID added after feature flags, skip upgrade test. */
if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0)
return;
mutex_enter(&ztest_vdev_lock);
name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
@ -2818,13 +2931,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
(void) spa_destroy(name);
nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1);
/*
* If we're configuring a RAIDZ device then make sure that the
* initial version is capable of supporting that feature.
*/
switch (ztest_opts.zo_raidz_parity) {
switch (ztest_opts.zo_raid_parity) {
case 0:
case 1:
initial_version = SPA_VERSION_INITIAL;
@ -2970,7 +3083,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
return;
mutex_enter(&ztest_vdev_lock);
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
ztest_opts.zo_raid_children;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@ -3024,7 +3138,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
*/
nvroot = make_vdev_root(NULL, NULL, NULL,
ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
"log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
"log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors,
1);
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
@ -3078,14 +3193,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
return;
}
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) *
ztest_opts.zo_raid_children;
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
spa_config_exit(spa, SCL_VDEV, FTAG);
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
@ -3134,7 +3250,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
char *aux;
char *path;
uint64_t guid = 0;
int error;
int error, ignore_err = 0;
if (ztest_opts.zo_mmp_test)
return;
@ -3157,7 +3273,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
/*
* Pick a random device to remove.
*/
guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)];
/* dRAID spares cannot be removed; try anyways to see ENOTSUP */
if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL)
ignore_err = ENOTSUP;
guid = svd->vdev_guid;
} else {
/*
* Find an unused device we can add.
@ -3214,7 +3336,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
case ZFS_ERR_DISCARDING_CHECKPOINT:
break;
default:
fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
if (error != ignore_err)
fatal(0, "spa_vdev_remove(%llu) = %d", guid,
error);
}
}
@ -3243,7 +3367,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id)
mutex_enter(&ztest_vdev_lock);
/* ensure we have a usable config; mirrors of raidz aren't supported */
if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) {
mutex_exit(&ztest_vdev_lock);
return;
}
@ -3343,6 +3467,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
int replacing;
int oldvd_has_siblings = B_FALSE;
int newvd_is_spare = B_FALSE;
int newvd_is_dspare = B_FALSE;
int oldvd_is_log;
int error, expected_error;
@ -3353,7 +3478,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
mutex_enter(&ztest_vdev_lock);
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
@ -3393,14 +3518,17 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (zs->zs_mirrors >= 1) {
ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children];
}
/* pick a child out of the raidz group */
if (ztest_opts.zo_raidz > 1) {
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
if (ztest_opts.zo_raid_children > 1) {
if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0)
ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
else
ASSERT(oldvd->vdev_ops == &vdev_draid_ops);
ASSERT(oldvd->vdev_children == ztest_opts.zo_raid_children);
oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children];
}
/*
@ -3447,6 +3575,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (sav->sav_count != 0 && ztest_random(3) == 0) {
newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
newvd_is_spare = B_TRUE;
if (newvd->vdev_ops == &vdev_draid_spare_ops)
newvd_is_dspare = B_TRUE;
(void) strcpy(newpath, newvd->vdev_path);
} else {
(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
@ -3480,6 +3612,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
* If newvd is already part of the pool, it should fail with EBUSY.
*
* If newvd is too small, it should fail with EOVERFLOW.
*
* If newvd is a distributed spare and it's being attached to a
* dRAID which is not its parent it should fail with EINVAL.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops && (!replacing ||
@ -3492,10 +3627,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
expected_error = replacing ? 0 : EBUSY;
else if (vdev_lookup_by_path(rvd, newpath) != NULL)
expected_error = EBUSY;
else if (newsize < oldsize)
else if (!newvd_is_dspare && newsize < oldsize)
expected_error = EOVERFLOW;
else if (ashift > oldvd->vdev_top->vdev_ashift)
expected_error = EDOM;
else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd))
expected_error = ENOTSUP;
else
expected_error = 0;
@ -4880,13 +5017,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
VERIFY(0 == dmu_read(os, packobj, packoff,
VERIFY0(dmu_read(os, packobj, packoff,
packsize, packcheck, DMU_READ_PREFETCH));
VERIFY(0 == dmu_read(os, bigobj, bigoff,
VERIFY0(dmu_read(os, bigobj, bigoff,
bigsize, bigcheck, DMU_READ_PREFETCH));
ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
ASSERT0(bcmp(packbuf, packcheck, packsize));
ASSERT0(bcmp(bigbuf, bigcheck, bigsize));
umem_free(packcheck, packsize);
umem_free(bigcheck, bigsize);
@ -5761,7 +5898,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
}
maxfaults = MAXFAULTS(zs);
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children;
mirror_save = zs->zs_mirrors;
mutex_exit(&ztest_vdev_lock);
@ -6011,7 +6148,7 @@ out:
/*
* By design ztest will never inject uncorrectable damage in to the pool.
* Issue a scrub, wait for it to complete, and verify there is never any
* any persistent damage.
* persistent damage.
*
* Only after a full scrub has been completed is it safe to start injecting
* data corruption. See the comment in zfs_fault_inject().
@ -7347,7 +7484,7 @@ ztest_init(ztest_shared_t *zs)
zs->zs_splits = 0;
zs->zs_mirrors = ztest_opts.zo_mirrors;
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1);
props = make_random_props();
/*
@ -7683,10 +7820,12 @@ main(int argc, char **argv)
if (ztest_opts.zo_verbose >= 1) {
(void) printf("%llu vdevs, %d datasets, %d threads,"
" %llu seconds...\n",
"%d %s disks, %llu seconds...\n\n",
(u_longlong_t)ztest_opts.zo_vdevs,
ztest_opts.zo_datasets,
ztest_opts.zo_threads,
ztest_opts.zo_raid_children,
ztest_opts.zo_raid_type,
(u_longlong_t)ztest_opts.zo_time);
}

View File

@ -209,6 +209,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/cmd/btree_test/Makefile
tests/zfs-tests/cmd/chg_usr_exec/Makefile
tests/zfs-tests/cmd/devname2devid/Makefile
tests/zfs-tests/cmd/draid/Makefile
tests/zfs-tests/cmd/dir_rd_update/Makefile
tests/zfs-tests/cmd/file_check/Makefile
tests/zfs-tests/cmd/file_trunc/Makefile

View File

@ -455,6 +455,7 @@ extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
nvlist_t *);
extern int zpool_checkpoint(zpool_handle_t *);
extern int zpool_discard_checkpoint(zpool_handle_t *);
extern boolean_t zpool_is_draid_spare(const char *);
/*
* Basic handle manipulations. These functions do not create or destroy the

View File

@ -82,6 +82,7 @@ COMMON_H = \
vdev_disk.h \
vdev_file.h \
vdev.h \
vdev_draid.h \
vdev_impl.h \
vdev_indirect_births.h \
vdev_indirect_mapping.h \

View File

@ -163,6 +163,7 @@ typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
void scan_init(void);
void scan_fini(void);
int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
void dsl_scan_setup_sync(void *, dmu_tx_t *);
void dsl_scan_fini(struct dsl_pool *dp);
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
int dsl_scan_cancel(struct dsl_pool *);

View File

@ -617,6 +617,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev"
#define ZPOOL_CONFIG_PATH "path"
#define ZPOOL_CONFIG_DEVID "devid"
#define ZPOOL_CONFIG_SPARE_ID "spareid"
#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array"
#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift"
#define ZPOOL_CONFIG_ASHIFT "ashift"
@ -757,10 +758,17 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors"
#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind"
/* dRAID configuration */
#define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata"
#define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares"
#define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
#define VDEV_TYPE_REPLACING "replacing"
#define VDEV_TYPE_RAIDZ "raidz"
#define VDEV_TYPE_DRAID "draid"
#define VDEV_TYPE_DRAID_SPARE "dspare"
#define VDEV_TYPE_DISK "disk"
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
@ -770,6 +778,12 @@ typedef struct zpool_load_policy {
#define VDEV_TYPE_L2CACHE "l2cache"
#define VDEV_TYPE_INDIRECT "indirect"
#define VDEV_RAIDZ_MAXPARITY 3
#define VDEV_DRAID_MAXPARITY 3
#define VDEV_DRAID_MIN_CHILDREN 2
#define VDEV_DRAID_MAX_CHILDREN UINT8_MAX
/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */
#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \
"com.delphix:indirect_obsolete_sm"

View File

@ -240,8 +240,9 @@ struct spa {
kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
int spa_min_ashift; /* of vdevs in normal class */
int spa_max_ashift; /* of vdevs in normal class */
uint64_t spa_min_ashift; /* of vdevs in normal class */
uint64_t spa_max_ashift; /* of vdevs in normal class */
uint64_t spa_min_alloc; /* of vdevs in normal class */
uint64_t spa_config_guid; /* config pool guid */
uint64_t spa_load_guid; /* spa_load initialized guid */
uint64_t spa_last_synced_guid; /* last synced guid */

View File

@ -41,6 +41,7 @@ extern "C" {
#define TXG_MASK (TXG_SIZE - 1) /* mask for size */
#define TXG_INITIAL TXG_SIZE /* initial txg */
#define TXG_IDX (txg & TXG_MASK)
#define TXG_UNKNOWN 0
/* Number of txgs worth of frees we defer adding to in-core spacemaps */
#define TXG_DEFER_SIZE 2

View File

@ -49,10 +49,13 @@ typedef enum vdev_dtl_type {
extern int zfs_nocacheflush;
typedef boolean_t vdev_open_children_func_t(vdev_t *vd);
extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...);
extern void vdev_dbgmsg_print_tree(vdev_t *, int);
extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *);
extern int vdev_validate(vdev_t *);
extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
@ -71,7 +74,10 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva,
size_t psize, uint64_t phys_birth);
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva,
size_t psize, uint64_t phys_birth);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
boolean_t scrub_done, boolean_t rebuild_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
@ -97,8 +103,14 @@ extern void vdev_metaslab_set_size(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd, char *tag);
typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs);
extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs);
extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs);
range_seg64_t *physical_rs, range_seg64_t *remain_rs);
extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
vdev_xlate_func_t *func, void *arg);
extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);

110
include/sys/vdev_draid.h Normal file
View File

@ -0,0 +1,110 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2016, Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
*/
#ifndef _SYS_VDEV_DRAID_H
#define _SYS_VDEV_DRAID_H
#include <sys/types.h>
#include <sys/abd.h>
#include <sys/nvpair.h>
#include <sys/zio.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Constants required to generate and use dRAID permutations.
*/
#define VDEV_DRAID_SEED 0xd7a1d5eed
#define VDEV_DRAID_MAX_MAPS 254
/* The row height is 2^VDEV_DRAID_ROWSHIFT; the shift is SPA_MAXBLOCKSHIFT. */
#define VDEV_DRAID_ROWSHIFT SPA_MAXBLOCKSHIFT
#define VDEV_DRAID_ROWHEIGHT (1ULL << VDEV_DRAID_ROWSHIFT)
/* Reserve equal to two full row heights. */
#define VDEV_DRAID_REFLOW_RESERVE (2 * VDEV_DRAID_ROWHEIGHT)
/*
* dRAID permutation map.
*/
typedef struct draid_map {
uint64_t dm_children; /* # of permutation columns */
uint64_t dm_nperms; /* # of permutation rows */
uint64_t dm_seed; /* dRAID map seed */
uint64_t dm_checksum; /* Checksum of generated map */
uint8_t *dm_perms; /* base permutation array */
} draid_map_t;
/*
* dRAID configuration.
*/
typedef struct vdev_draid_config {
/*
* Values read from the dRAID nvlist configuration.
*/
uint64_t vdc_ndata; /* # of data devices in group */
uint64_t vdc_nparity; /* # of parity devices in group */
uint64_t vdc_nspares; /* # of distributed spares */
uint64_t vdc_children; /* # of children */
uint64_t vdc_ngroups; /* # groups per slice */
/*
* Immutable derived constants.
*/
uint8_t *vdc_perms; /* permutation array */
uint64_t vdc_nperms; /* # of permutations */
uint64_t vdc_groupwidth; /* = data + parity */
uint64_t vdc_ndisks; /* = children - spares */
/*
* NOTE(review): no DRAID_ROWSIZE is defined in this header; the
* comment below presumably means VDEV_DRAID_ROWHEIGHT -- confirm
* against the code that fills in vdc_groupsz.
*/
uint64_t vdc_groupsz; /* = groupwidth * DRAID_ROWSIZE */
uint64_t vdc_devslicesz; /* = (groupsz * groups) / ndisks */
} vdev_draid_config_t;
/*
* Functions for handling dRAID permutation maps.
*/
extern uint64_t vdev_draid_rand(uint64_t *);
extern int vdev_draid_lookup_map(uint64_t, const draid_map_t **);
extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **);
/*
* General dRAID support functions.
*/
extern boolean_t vdev_draid_readable(vdev_t *, uint64_t);
extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t);
extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t);
extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *);
extern nvlist_t *vdev_draid_read_config_spare(vdev_t *);
/* Functions for dRAID distributed spares. */
extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t);
extern vdev_t *vdev_draid_spare_get_parent(vdev_t *);
extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_VDEV_DRAID_H */

View File

@ -68,14 +68,19 @@ extern uint32_t zfs_vdev_async_write_max_active;
/*
* Virtual device operations
*/
typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd);
typedef void vdev_fini_func_t(vdev_t *vd);
typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
uint64_t *ashift, uint64_t *pshift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef uint64_t vdev_min_asize_func_t(vdev_t *vd);
typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd);
typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t);
typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva,
size_t psize, uint64_t phys_birth);
typedef void vdev_hold_func_t(vdev_t *vd);
typedef void vdev_rele_func_t(vdev_t *vd);
@ -87,13 +92,24 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
* Given a target vdev, translates the logical range "in" to the physical
* range "res"
*/
typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in,
range_seg64_t *res);
typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical,
range_seg64_t *physical, range_seg64_t *remain);
typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start,
uint64_t size, uint64_t max_segment);
typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp,
uint64_t *sizep);
typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv);
typedef uint64_t vdev_nparity_func_t(vdev_t *vd);
typedef uint64_t vdev_ndisks_func_t(vdev_t *vd);
typedef const struct vdev_ops {
vdev_init_func_t *vdev_op_init;
vdev_fini_func_t *vdev_op_fini;
vdev_open_func_t *vdev_op_open;
vdev_close_func_t *vdev_op_close;
vdev_asize_func_t *vdev_op_asize;
vdev_min_asize_func_t *vdev_op_min_asize;
vdev_min_alloc_func_t *vdev_op_min_alloc;
vdev_io_start_func_t *vdev_op_io_start;
vdev_io_done_func_t *vdev_op_io_done;
vdev_state_change_func_t *vdev_op_state_change;
@ -101,11 +117,12 @@ typedef const struct vdev_ops {
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
vdev_remap_func_t *vdev_op_remap;
/*
* For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
* Used when initializing vdevs. Isn't used by leaf ops.
*/
vdev_xlation_func_t *vdev_op_xlate;
vdev_rebuild_asize_func_t *vdev_op_rebuild_asize;
vdev_metaslab_init_func_t *vdev_op_metaslab_init;
vdev_config_generate_func_t *vdev_op_config_generate;
vdev_nparity_func_t *vdev_op_nparity;
vdev_ndisks_func_t *vdev_op_ndisks;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@ -325,16 +342,13 @@ struct vdev {
kthread_t *vdev_rebuild_thread;
vdev_rebuild_t vdev_rebuild_config;
/* For limiting outstanding I/Os (initialize, TRIM, rebuild) */
/* For limiting outstanding I/Os (initialize, TRIM) */
kmutex_t vdev_initialize_io_lock;
kcondvar_t vdev_initialize_io_cv;
uint64_t vdev_initialize_inflight;
kmutex_t vdev_trim_io_lock;
kcondvar_t vdev_trim_io_cv;
uint64_t vdev_trim_inflight[3];
kmutex_t vdev_rebuild_io_lock;
kcondvar_t vdev_rebuild_io_cv;
uint64_t vdev_rebuild_inflight;
/*
* Values stored in the config for an indirect or removing vdev.
@ -392,7 +406,6 @@ struct vdev {
uint64_t vdev_removed; /* persistent removed state */
uint64_t vdev_resilver_txg; /* persistent resilvering state */
uint64_t vdev_rebuild_txg; /* persistent rebuilding state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
char *vdev_physpath; /* vdev device path (if any) */
@ -445,8 +458,6 @@ struct vdev {
zfs_ratelimit_t vdev_checksum_rl;
};
#define VDEV_RAIDZ_MAXPARITY 3
#define VDEV_PAD_SIZE (8 << 10)
/* 2 padding areas (vl_pad1 and vl_be) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
@ -532,6 +543,9 @@ typedef struct vdev_label {
#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t))
#define VDEV_LABELS 4
#define VDEV_BEST_LABEL VDEV_LABELS
/*
 * Evaluates to true when offset 'off' on vdev 'vd' lies below
 * VDEV_LABEL_START_SIZE or at/after (vdev_psize - VDEV_LABEL_END_SIZE).
 * Note that 'off' is expanded twice, so avoid arguments with side effects.
 */
#define VDEV_OFFSET_IS_LABEL(vd, off) \
(((off) < VDEV_LABEL_START_SIZE) || \
((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE)))
#define VDEV_ALLOC_LOAD 0
#define VDEV_ALLOC_ADD 1
@ -577,6 +591,8 @@ extern vdev_ops_t vdev_root_ops;
extern vdev_ops_t vdev_mirror_ops;
extern vdev_ops_t vdev_replacing_ops;
extern vdev_ops_t vdev_raidz_ops;
extern vdev_ops_t vdev_draid_ops;
extern vdev_ops_t vdev_draid_spare_ops;
extern vdev_ops_t vdev_disk_ops;
extern vdev_ops_t vdev_file_ops;
extern vdev_ops_t vdev_missing_ops;
@ -587,11 +603,15 @@ extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in,
range_seg64_t *out);
extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs, range_seg64_t *remain_rs);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_default_min_asize(vdev_t *vd);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
extern uint64_t vdev_get_min_alloc(vdev_t *vd);
extern uint64_t vdev_get_nparity(vdev_t *vd);
extern uint64_t vdev_get_ndisks(vdev_t *vd);
/*
* Global variables

View File

@ -32,6 +32,7 @@ extern "C" {
#endif
struct zio;
struct raidz_row;
struct raidz_map;
#if !defined(_KERNEL)
struct kernel_param {};
@ -43,8 +44,11 @@ struct kernel_param {};
struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t,
uint64_t);
void vdev_raidz_map_free(struct raidz_map *);
void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *);
void vdev_raidz_generate_parity(struct raidz_map *);
int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
void vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
void vdev_raidz_child_done(zio_t *);
void vdev_raidz_io_done(zio_t *);
/*
* vdev_raidz_math interface
@ -52,11 +56,16 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
void vdev_raidz_math_init(void);
void vdev_raidz_math_fini(void);
const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
int vdev_raidz_math_generate(struct raidz_map *);
int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
const int);
int vdev_raidz_math_generate(struct raidz_map *, struct raidz_row *);
int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *,
const int *, const int *, const int);
int vdev_raidz_impl_set(const char *);
/*
 * RAIDZ-specific vdev configuration.
 * NOTE(review): presumably this carries the parity count that used to be
 * kept in a raidz-specific field of struct vdev -- confirm against
 * vdev_raidz.c.
 */
typedef struct vdev_raidz {
int vd_logical_width; /* presumably the number of child columns; verify */
int vd_nparity; /* presumably the number of parity devices; verify */
} vdev_raidz_t;
#ifdef __cplusplus
}
#endif

View File

@ -29,6 +29,7 @@
#include <sys/debug.h>
#include <sys/kstat.h>
#include <sys/abd.h>
#include <sys/vdev_impl.h>
#ifdef __cplusplus
extern "C" {
@ -106,30 +107,45 @@ typedef struct raidz_col {
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
abd_t *rc_abd; /* I/O data */
void *rc_gdata; /* used to store the "good" version */
void *rc_orig_data; /* pre-reconstruction */
abd_t *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
uint8_t rc_skipped; /* Did we skip this I/O column? */
uint8_t rc_need_orig_restore; /* need to restore from orig_data? */
uint8_t rc_repair; /* Write good data to this column */
} raidz_col_t;
/*
 * A single row of a raidz map: the per-row column counts, missing-device
 * counts, and auxiliary buffers, followed by the row's I/O columns.
 * The rr_offset and rr_size fields exist only when ZFS_DEBUG is defined.
 */
typedef struct raidz_row {
uint64_t rr_cols; /* Regular column count */
uint64_t rr_scols; /* Count including skipped columns */
uint64_t rr_bigcols; /* Remainder data column count */
uint64_t rr_missingdata; /* Count of missing data devices */
uint64_t rr_missingparity; /* Count of missing parity devices */
uint64_t rr_firstdatacol; /* First data column/parity count */
abd_t *rr_abd_copy; /* rm_asize-buffer of copied data */
abd_t *rr_abd_empty; /* dRAID empty sector buffer */
int rr_nempty; /* empty sectors included in parity */
int rr_code; /* reconstruction code (unused) */
#ifdef ZFS_DEBUG
uint64_t rr_offset; /* Logical offset for *_io_verify() */
uint64_t rr_size; /* Physical size for *_io_verify() */
#endif
raidz_col_t rr_col[0]; /* Flexible array of I/O columns */
} raidz_row_t;
typedef struct raidz_map {
uint64_t rm_cols; /* Regular column count */
uint64_t rm_scols; /* Count including skipped columns */
uint64_t rm_bigcols; /* Number of oversized columns */
uint64_t rm_asize; /* Actual total I/O size */
uint64_t rm_missingdata; /* Count of missing data devices */
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
uint64_t rm_nskip; /* Skipped sectors for padding */
uint64_t rm_skipstart; /* Column index of padding start */
abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
uint8_t rm_ecksuminjected; /* checksum error was injected */
boolean_t rm_freed; /* map no longer has referencing ZIO */
boolean_t rm_ecksuminjected; /* checksum error was injected */
int rm_nrows; /* Regular row count */
int rm_nskip; /* RAIDZ sectors skipped for padding */
int rm_skipstart; /* Column index of padding start */
const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
raidz_row_t *rm_row[0]; /* flexible array of rows */
} raidz_map_t;
#define RAIDZ_ORIGINAL_IMPL (INT_MAX)
extern const raidz_impl_ops_t vdev_raidz_scalar_impl;
@ -163,14 +179,15 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl;
*
* raidz_parity Returns parity of the RAIDZ block
* raidz_ncols Returns number of columns the block spans
* Note, all rows have the same number of columns.
* raidz_nbigcols Returns number of big columns
* raidz_col_p Returns pointer to a column
* raidz_col_size Returns size of a column
* raidz_big_size Returns size of big columns
* raidz_short_size Returns size of short columns
*/
#define raidz_parity(rm) ((rm)->rm_firstdatacol)
#define raidz_ncols(rm) ((rm)->rm_cols)
#define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol)
#define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols)
#define raidz_nbigcols(rm) ((rm)->rm_bigcols)
#define raidz_col_p(rm, c) ((rm)->rm_col + (c))
#define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size)
@ -185,10 +202,10 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl;
*/
#define _RAIDZ_GEN_WRAP(code, impl) \
static void \
impl ## _gen_ ## code(void *rmp) \
impl ## _gen_ ## code(void *rrp) \
{ \
raidz_map_t *rm = (raidz_map_t *)rmp; \
raidz_generate_## code ## _impl(rm); \
raidz_row_t *rr = (raidz_row_t *)rrp; \
raidz_generate_## code ## _impl(rr); \
}
/*
@ -199,10 +216,10 @@ impl ## _gen_ ## code(void *rmp) \
*/
#define _RAIDZ_REC_WRAP(code, impl) \
static int \
impl ## _rec_ ## code(void *rmp, const int *tgtidx) \
impl ## _rec_ ## code(void *rrp, const int *tgtidx) \
{ \
raidz_map_t *rm = (raidz_map_t *)rmp; \
return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \
raidz_row_t *rr = (raidz_row_t *)rrp; \
return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \
}
/*

View File

@ -66,10 +66,14 @@ typedef struct vdev_rebuild {
vdev_t *vr_top_vdev; /* top-level vdev to rebuild */
metaslab_t *vr_scan_msp; /* scanning disabled metaslab */
range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */
kmutex_t vr_io_lock; /* inflight IO lock */
kcondvar_t vr_io_cv; /* inflight IO cv */
/* In-core state and progress */
uint64_t vr_scan_offset[TXG_SIZE];
uint64_t vr_prev_scan_time_ms; /* any previous scan time */
uint64_t vr_bytes_inflight_max; /* maximum bytes inflight */
uint64_t vr_bytes_inflight; /* current bytes inflight */
/* Per-rebuild pass statistics for calculating bandwidth */
uint64_t vr_pass_start_time;

View File

@ -372,6 +372,7 @@ struct zio_cksum_report {
nvlist_t *zcr_detector;
void *zcr_cbdata;
size_t zcr_cbinfo; /* passed to zcr_free() */
uint64_t zcr_sector;
uint64_t zcr_align;
uint64_t zcr_length;
zio_cksum_finish_f *zcr_finish;

View File

@ -76,6 +76,7 @@ typedef enum spa_feature {
SPA_FEATURE_LIVELIST,
SPA_FEATURE_DEVICE_REBUILD,
SPA_FEATURE_ZSTD_COMPRESS,
SPA_FEATURE_DRAID,
SPA_FEATURES
} spa_feature_t;

View File

@ -5336,6 +5336,16 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
* 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
* the 128k block example above.
*
* The situation is slightly different for dRAID since the minimum allocation
* size is the full group width. The same 8K block above would be written as
* follows in a dRAID group:
*
* +-------+-------+-------+-------+-------+
* | disk1 | disk2 | disk3 | disk4 | disk5 |
* +-------+-------+-------+-------+-------+
* | P0 | D0 | D1 | S0 | S1 |
* +-------+-------+-------+-------+-------+
*
* Compression may lead to a variety of block sizes being written for the same
* volume or file. There is no clear way to reserve just the amount of space
* that will be required, so the worst case (no compression) is assumed.
@ -5365,6 +5375,23 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
return (asize);
}
/*
 * Userspace counterpart of the function of the same name in
 * module/zfs/vdev_draid.c.
 *
 * Computes the number of bytes allocated for a block of 'blksize' bytes in
 * a group that is 'ndisks' wide, of which 'nparity' columns hold parity.
 * The block is rounded up to a whole number of rows, where one row stores
 * (ndisks - nparity) sectors of data and each sector is (1 << ashift) bytes;
 * every row consumes a sector on all 'ndisks' columns.
 */
static uint64_t
vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
    uint64_t blksize)
{
	ASSERT3U(ndisks, >, nparity);

	/* Data bytes that fit in a single row of the group. */
	const uint64_t row_data_bytes = (ndisks - nparity) << ashift;

	/* Number of rows needed to hold the block, rounded up. */
	const uint64_t nrows = ((blksize - 1) / row_data_bytes) + 1;

	return ((nrows * ndisks) << ashift);
}
/*
* Determine how much space will be allocated if it lands on the most space-
* inefficient top-level vdev. Returns the size in bytes required to store one
@ -5374,7 +5401,7 @@ static uint64_t
volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
{
nvlist_t *config, *tree, **vdevs;
uint_t nvdevs, v;
uint_t nvdevs;
uint64_t ret = 0;
config = zpool_get_config(zhp, NULL);
@ -5384,33 +5411,61 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
return (nblocks * blksize);
}
for (v = 0; v < nvdevs; v++) {
for (int v = 0; v < nvdevs; v++) {
char *type;
uint64_t nparity, ashift, asize, tsize;
nvlist_t **disks;
uint_t ndisks;
uint64_t volsize;
if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
&type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
&nparity) != 0 ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
&ashift) != 0 ||
nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
&disks, &ndisks) != 0) {
&type) != 0)
continue;
if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
strcmp(type, VDEV_TYPE_DRAID) != 0)
continue;
if (nvlist_lookup_uint64(vdevs[v],
ZPOOL_CONFIG_NPARITY, &nparity) != 0)
continue;
if (nvlist_lookup_uint64(vdevs[v],
ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
continue;
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
nvlist_t **disks;
uint_t ndisks;
if (nvlist_lookup_nvlist_array(vdevs[v],
ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0)
continue;
/* allocation size for the "typical" 128k block */
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
SPA_OLD_MAXBLOCKSIZE);
/* allocation size for the blksize block */
asize = vdev_raidz_asize(ndisks, nparity, ashift,
blksize);
} else {
uint64_t ndata;
if (nvlist_lookup_uint64(vdevs[v],
ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0)
continue;
/* allocation size for the "typical" 128k block */
tsize = vdev_draid_asize(ndata + nparity, nparity,
ashift, SPA_OLD_MAXBLOCKSIZE);
/* allocation size for the blksize block */
asize = vdev_draid_asize(ndata + nparity, nparity,
ashift, blksize);
}
/* allocation size for the "typical" 128k block */
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
SPA_OLD_MAXBLOCKSIZE);
/* allocation size for the blksize block */
asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);
/*
* Scale this size down as a ratio of 128k / tsize. See theory
* statement above.
* Scale this size down as a ratio of 128k / tsize.
* See theory statement above.
*/
volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
if (volsize > ret) {

View File

@ -112,7 +112,6 @@ refresh_config_libzfs(void *handle, nvlist_t *tryconfig)
return (refresh_config((libzfs_handle_t *)handle, tryconfig));
}
static int
pool_active_libzfs(void *handle, const char *name, uint64_t guid,
boolean_t *isactive)

View File

@ -42,10 +42,10 @@
#include <sys/efi_partition.h>
#include <sys/systeminfo.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_sysfs.h>
#include <sys/vdev_disk.h>
#include <dlfcn.h>
#include <libzutil.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
#include "libzfs_impl.h"
@ -481,7 +481,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
if (err != 0) {
ASSERT3U(err, ==, ENOENT);
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid feature '%s'"), fname);
"feature '%s' unsupported by kernel"),
fname);
(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
goto error;
}
@ -960,6 +961,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool)
if (ret == 0 && !isopen &&
(strncmp(pool, "mirror", 6) == 0 ||
strncmp(pool, "raidz", 5) == 0 ||
strncmp(pool, "draid", 5) == 0 ||
strncmp(pool, "spare", 5) == 0 ||
strcmp(pool, "log") == 0)) {
if (hdl != NULL)
@ -1186,6 +1188,37 @@ zpool_has_special_vdev(nvlist_t *nvroot)
return (B_FALSE);
}
/*
* Output a dRAID top-level vdev name in to the provided buffer.
*/
static char *
zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity,
uint64_t spares, uint64_t children)
{
snprintf(name, len, "%s%llu:%llud:%lluc:%llus",
VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data,
(u_longlong_t)children, (u_longlong_t)spares);
return (name);
}
/*
 * Return B_TRUE if the provided name is a dRAID spare name, i.e. it begins
 * with VDEV_TYPE_DRAID immediately followed by three unsigned numbers
 * separated by '-' (parity, vdev id, spare id); otherwise return B_FALSE.
 * Only the shape of the name is checked, the parsed values are discarded.
 */
boolean_t
zpool_is_draid_spare(const char *name)
{
	uint64_t parity, vdev_id, spare_id;
	int nfields;

	nfields = sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu",
	    (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id,
	    (u_longlong_t *)&spare_id);

	/* All three fields must have been converted for a match. */
	return (nfields == 3 ? B_TRUE : B_FALSE);
}
/*
* Create the named pool, using the provided vdev list. It is assumed
* that the consumer has already validated the contents of the nvlist, so we
@ -2668,6 +2701,11 @@ zpool_vdev_is_interior(const char *name)
VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 ||
strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
return (B_TRUE);
if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 &&
!zpool_is_draid_spare(name))
return (B_TRUE);
return (B_FALSE);
}
@ -3101,7 +3139,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE,
&type) == 0);
if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
if ((strcmp(type, VDEV_TYPE_SPARE) == 0 ||
strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) &&
children == 2 && child[which] == tgt)
return (B_TRUE);
@ -3216,8 +3255,12 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
"cannot replace a log with a spare"));
} else if (rebuild) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"only mirror vdevs support sequential "
"reconstruction"));
"only mirror and dRAID vdevs support "
"sequential reconstruction"));
} else if (zpool_is_draid_spare(new_disk)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dRAID spares can only replace child "
"devices in their parent's dRAID vdev"));
} else if (version >= SPA_VERSION_MULTI_REPLACE) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"already in replacing/spare config; wait "
@ -3618,6 +3661,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
if (zpool_is_draid_spare(path)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"dRAID spares cannot be removed"));
return (zfs_error(hdl, EZFS_NODEVICE, msg));
}
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache,
&islog)) == NULL)
@ -3955,9 +4004,10 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
}
/*
* Remove the partition from the path it this is a whole disk.
* Remove the partition from the path if this is a whole disk.
*/
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 &&
nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
== 0 && value && !(name_flags & VDEV_NAME_PATH)) {
return (zfs_strip_partition(path));
}
@ -3975,6 +4025,27 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv,
path = buf;
}
/*
* If it's a dRAID device, we add the parity, data, children, and spares counts.
*/
if (strcmp(path, VDEV_TYPE_DRAID) == 0) {
uint64_t ndata, nparity, nspares;
nvlist_t **child;
uint_t children;
verify(nvlist_lookup_nvlist_array(nv,
ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
verify(nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_NPARITY, &nparity) == 0);
verify(nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0);
verify(nvlist_lookup_uint64(nv,
ZPOOL_CONFIG_DRAID_NSPARES, &nspares) == 0);
path = zpool_draid_name(buf, sizeof (buf), ndata,
nparity, nspares, children);
}
/*
* We identify each top-level vdev by using a <type-id>
* naming convention.

View File

@ -124,6 +124,8 @@ KERNEL_C = \
unique.c \
vdev.c \
vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_file.c \
vdev_indirect_births.c \
vdev_indirect.c \
@ -216,7 +218,7 @@ libzpool_la_LIBADD = \
$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
$(abs_top_builddir)/lib/libzstd/libzstd.la
libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl
libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm
libzpool_la_LDFLAGS = -pthread

View File

@ -61,6 +61,11 @@ during testing.
.IP
Size of data for raidz block. Size is 1 << (zio_size_shift).
.HP
.BI "\-r" " reflow_offset" " (default: uint max)"
.IP
Set raidz expansion offset. The expanded raidz map allocation function will
produce different map configurations depending on this value.
.HP
.BI "\-S(weep)"
.IP
Sweep parameter space while verifying the raidz implementations. This option
@ -77,6 +82,10 @@ This options starts the benchmark mode. All implementations are benchmarked
using increasing per disk data size. Results are given as throughput per disk,
measured in MiB/s.
.HP
.BI "\-e(xpansion)"
.IP
Use expanded raidz map allocation function.
.HP
.BI "\-v(erbose)"
.IP
Increase verbosity.

View File

@ -23,6 +23,7 @@
.\" Copyright (c) 2009 Oracle and/or its affiliates. All rights reserved.
.\" Copyright (c) 2009 Michael Gebetsroither <michael.geb@gmx.at>. All rights
.\" reserved.
.\" Copyright (c) 2017, Intel Corporation.
.\"
.TH ZTEST 1 "Aug 24, 2020" OpenZFS
@ -82,13 +83,29 @@ Used alignment in test.
.IP
Number of mirror copies.
.HP
.BI "\-r" " raidz_disks" " (default: 4)"
.BI "\-r" " raidz_disks / draid_disks" " (default: 4 / 16)"
.IP
Number of raidz or draid disks.
.HP
.BI "\-R" " raidz_parity" " (default: 1)"
.BI "\-R" " raid_parity" " (default: 1)"
.IP
Raidz parity.
Raid parity (raidz & draid).
.HP
.BI "\-K" " raid_kind" " (default: 'random') raidz|draid|random"
.IP
The kind of RAID config to use. With 'random' the kind alternates between raidz and draid.
.HP
.BI "\-D" " draid_data" " (default: 4)"
.IP
Number of data disks in a dRAID redundancy group.
.HP
.BI "\-S" " draid_spares" " (default: 1)"
.IP
Number of dRAID distributed spare disks.
.HP
.BI "\-C" " vdev_class_state" " (default: random)"
.IP
The vdev allocation class state: special=on|off|random.
.HP
.BI "\-d" " datasets" " (default: 7)"
.IP

View File

@ -2902,6 +2902,31 @@ top-level vdev.
Default value: \fB1,048,576\fR.
.RE
.sp
.ne 2
.na
\fBzfs_rebuild_scrub_enabled\fR (int)
.ad
.RS 12n
Automatically start a pool scrub when the last active sequential resilver
completes in order to verify the checksums of all blocks which have been
resilvered. This option is enabled by default and is strongly recommended.
.sp
Default value: \fB1\fR.
.RE
.sp
.ne 2
.na
\fBzfs_rebuild_vdev_limit\fR (ulong)
.ad
.RS 12n
Maximum amount of I/O that can be concurrently issued for a sequential
resilver per leaf device, given in bytes.
.sp
Default value: \fB33,554,432\fR.
.RE
.sp
.ne 2
.na

View File

@ -306,6 +306,30 @@ This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is used
on a top-level vdev, and will never return to being \fBenabled\fR.
.RE
.sp
.ne 2
.na
\fBdraid\fR
.ad
.RS 4n
.TS
l l .
GUID org.openzfs:draid
READ\-ONLY COMPATIBLE no
DEPENDENCIES none
.TE
This feature enables use of the \fBdraid\fR vdev type. dRAID is a variant
of raidz which provides integrated distributed hot spares that allow faster
resilvering while retaining the benefits of raidz. Data, parity, and spare
space are organized in redundancy groups and distributed evenly over all of
the devices.
This feature becomes \fBactive\fR when creating a pool which uses the
\fBdraid\fR vdev type, or when adding a new \fBdraid\fR vdev to an
existing pool.
.RE
.sp
.ne 2
.na

View File

@ -73,12 +73,14 @@ and period
The pool names
.Sy mirror ,
.Sy raidz ,
.Sy draid ,
.Sy spare
and
.Sy log
are reserved, as are names beginning with
.Sy mirror ,
.Sy raidz ,
.Sy draid ,
.Sy spare ,
and the pattern
.Sy c[0-9] .

View File

@ -52,7 +52,7 @@ Begins a scrub or resumes a paused scrub.
The scrub examines all data in the specified pools to verify that it checksums
correctly.
For replicated
.Pq mirror or raidz
.Pq mirror, raidz, or draid
devices, ZFS automatically repairs any damage discovered during the scrub.
The
.Nm zpool Cm status

View File

@ -64,7 +64,7 @@ A file must be specified by a full path.
A mirror of two or more devices.
Data is replicated in an identical fashion across all components of a mirror.
A mirror with N disks of size X can hold X bytes and can withstand (N-1) devices
failing before data integrity is compromised.
failing without losing data.
.It Sy raidz , raidz1 , raidz2 , raidz3
A variation on RAID-5 that allows for better distribution of parity and
eliminates the RAID-5
@ -88,11 +88,75 @@ vdev type is an alias for
.Sy raidz1 .
.Pp
A raidz group with N disks of size X with P parity disks can hold approximately
(N-P)*X bytes and can withstand P device(s) failing before data integrity is
compromised.
(N-P)*X bytes and can withstand P device(s) failing without losing data.
The minimum number of devices in a raidz group is one more than the number of
parity disks.
The recommended number is between 3 and 9 to help increase performance.
.It Sy draid , draid1 , draid2 , draid3
A variant of raidz that provides integrated distributed hot spares which
allows for faster resilvering while retaining the benefits of raidz.
A dRAID vdev is constructed from multiple internal raidz groups, each with D
data devices and P parity devices.
These groups are distributed over all of the children in order to fully
utilize the available disk performance.
.Pp
Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with
zeros) to allow fully sequential resilvering.
This fixed stripe width significantly affects both usable capacity and IOPS.
For example, with the default D=8 and 4k disk sectors the minimum allocation
size is 32k.
If using compression, this relatively large allocation size can reduce the
effective compression ratio.
When using ZFS volumes and dRAID the default volblocksize property is increased
to account for the allocation size.
If a dRAID pool will hold a significant amount of small blocks, it is
recommended to also add a mirrored
.Sy special
vdev to store those blocks.
.Pp
With regard to IO/s, performance is similar to raidz since for any read all D
data disks must be accessed.
Delivered random IOPS can be reasonably approximated as
floor((N-S)/(D+P))*<single-drive-IOPS>.
.Pp
Like raidz a dRAID can have single-, double-, or triple-parity. The
.Sy draid1 ,
.Sy draid2 ,
and
.Sy draid3
types can be used to specify the parity level.
The
.Sy draid
vdev type is an alias for
.Sy draid1 .
.Pp
A dRAID with N disks of size X, D data disks per redundancy group, P parity
level, and S distributed hot spares can hold approximately (N-S)*(D/(D+P))*X
bytes and can withstand P device(s) failing without losing data.
.It Sy draid[<parity>][:<data>d][:<children>c][:<spares>s]
A non-default dRAID configuration can be specified by appending one or more
of the following optional arguments to the
.Sy draid
keyword.
.Pp
.Em parity
- The parity level (1-3).
.Pp
.Em data
- The number of data devices per redundancy group.
In general a smaller value of D will increase IOPS, improve the compression ratio, and speed up resilvering at the expense of total usable capacity.
Defaults to 8, unless N-P-S is less than 8.
.Pp
.Em children
- The expected number of children.
Useful as a cross-check when listing a large number of devices.
An error is returned when the provided number of children differs.
.Pp
.Em spares
- The number of distributed hot spares.
Defaults to zero.
.Pp
.Pp
.It Sy spare
A pseudo-vdev which keeps track of available hot spares for a pool.
For more information, see the
@ -273,6 +337,14 @@ If the original faulted device is detached, then the hot spare assumes its
place in the configuration, and is removed from the spare list of all active
pools.
.Pp
The
.Sy draid
vdev type provides distributed hot spares.
These hot spares are named after the dRAID vdev they're a part of
.Po
.Qq draid1-2-3
specifies spare 3 of vdev 2, which is a single parity dRAID
.Pc
and may only be used by that dRAID vdev.
Otherwise, they behave the same as normal hot spares.
.Pp
Spares cannot replace log devices.
.Ss Intent Log
The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous

View File

@ -243,6 +243,8 @@ SRCS+= abd.c \
unique.c \
vdev.c \
vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_indirect.c \
vdev_indirect_births.c \
vdev_indirect_mapping.c \
@ -341,6 +343,7 @@ CFLAGS.lz4.c= -Wno-cast-qual
CFLAGS.spa.c= -Wno-cast-qual
CFLAGS.spa_misc.c= -Wno-cast-qual
CFLAGS.sysctl_os.c= -include ../zfs_config.h
CFLAGS.vdev_draid.c= -Wno-cast-qual
CFLAGS.vdev_raidz.c= -Wno-cast-qual
CFLAGS.vdev_raidz_math.c= -Wno-cast-qual
CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual

View File

@ -292,19 +292,28 @@ vdev_file_io_done(zio_t *zio)
}
vdev_ops_t vdev_file_ops = {
vdev_file_open,
vdev_file_close,
vdev_default_asize,
vdev_file_io_start,
vdev_file_io_done,
NULL,
NULL,
vdev_file_hold,
vdev_file_rele,
NULL,
vdev_default_xlate,
VDEV_TYPE_FILE, /* name of this vdev type */
B_TRUE /* leaf vdev */
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_file_open,
.vdev_op_close = vdev_file_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_file_io_start,
.vdev_op_io_done = vdev_file_io_done,
.vdev_op_state_change = NULL,
.vdev_op_need_resilver = NULL,
.vdev_op_hold = vdev_file_hold,
.vdev_op_rele = vdev_file_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
/*
@ -313,19 +322,28 @@ vdev_ops_t vdev_file_ops = {
#ifndef _KERNEL
vdev_ops_t vdev_disk_ops = {
vdev_file_open,
vdev_file_close,
vdev_default_asize,
vdev_file_io_start,
vdev_file_io_done,
NULL,
NULL,
vdev_file_hold,
vdev_file_rele,
NULL,
vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_file_open,
.vdev_op_close = vdev_file_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_file_io_start,
.vdev_op_io_done = vdev_file_io_done,
.vdev_op_state_change = NULL,
.vdev_op_need_resilver = NULL,
.vdev_op_hold = vdev_file_hold,
.vdev_op_rele = vdev_file_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
#endif

View File

@ -1189,17 +1189,26 @@ vdev_geom_rele(vdev_t *vd)
}
vdev_ops_t vdev_disk_ops = {
vdev_geom_open,
vdev_geom_close,
vdev_default_asize,
vdev_geom_io_start,
vdev_geom_io_done,
NULL,
NULL,
vdev_geom_hold,
vdev_geom_rele,
NULL,
vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_geom_open,
.vdev_op_close = vdev_geom_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_geom_io_start,
.vdev_op_io_done = vdev_geom_io_done,
.vdev_op_state_change = NULL,
.vdev_op_need_resilver = NULL,
.vdev_op_hold = vdev_geom_hold,
.vdev_op_rele = vdev_geom_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};

View File

@ -826,9 +826,13 @@ vdev_disk_rele(vdev_t *vd)
}
vdev_ops_t vdev_disk_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_disk_io_start,
.vdev_op_io_done = vdev_disk_io_done,
.vdev_op_state_change = NULL,
@ -837,6 +841,11 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_rele = vdev_disk_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};

View File

@ -305,9 +305,13 @@ vdev_file_io_done(zio_t *zio)
}
vdev_ops_t vdev_file_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_file_open,
.vdev_op_close = vdev_file_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_file_io_start,
.vdev_op_io_done = vdev_file_io_done,
.vdev_op_state_change = NULL,
@ -316,6 +320,11 @@ vdev_ops_t vdev_file_ops = {
.vdev_op_rele = vdev_file_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
@ -341,9 +350,13 @@ vdev_file_fini(void)
#ifndef _KERNEL
vdev_ops_t vdev_disk_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_file_open,
.vdev_op_close = vdev_file_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_file_io_start,
.vdev_op_io_done = vdev_file_io_done,
.vdev_op_state_change = NULL,
@ -352,6 +365,11 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_rele = vdev_file_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};

View File

@ -576,7 +576,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_DEVICE_REBUILD,
"org.openzfs:device_rebuild", "device_rebuild",
"Support for sequential device rebuilds",
"Support for sequential mirror/dRAID device rebuilds",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
{
@ -589,6 +589,10 @@ zpool_feature_init(void)
"zstd compression algorithm support.",
ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps);
}
zfeature_register(SPA_FEATURE_DRAID,
"org.openzfs:draid", "draid", "Support for distributed parity RAID",
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL);
}
#if defined(_KERNEL)

View File

@ -442,7 +442,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
return (-1);
}
if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) {
if (strcmp(pool, "mirror") == 0 ||
strcmp(pool, "raidz") == 0 ||
strcmp(pool, "draid") == 0) {
if (why)
*why = NAME_ERR_RESERVED;
return (-1);

View File

@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o
$(MODULE)-objs += unique.o
$(MODULE)-objs += vdev.o
$(MODULE)-objs += vdev_cache.o
$(MODULE)-objs += vdev_draid.o
$(MODULE)-objs += vdev_draid_rand.o
$(MODULE)-objs += vdev_indirect.o
$(MODULE)-objs += vdev_indirect_births.o
$(MODULE)-objs += vdev_indirect_mapping.o

View File

@ -781,16 +781,17 @@ int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_iter_func_t *func, void *private)
{
int ret = 0;
struct abd_iter aiter;
boolean_t abd_multi;
abd_t *c_abd;
int ret = 0;
if (size == 0)
return (0);
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
abd_multi = abd_is_gang(abd);
c_abd = abd_init_abd_iter(abd, &aiter, off);
boolean_t abd_multi = abd_is_gang(abd);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
/* If we are at the end of the gang ABD we are done */
@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
abd_t *c_dabd, *c_sabd;
if (size == 0)
return (0);
abd_verify(dabd);
abd_verify(sabd);

View File

@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
return (0);
}
static void
void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
@ -3327,20 +3327,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
return (B_TRUE);
}
/*
* Check if the txg falls within the range which must be
* resilvered. DVAs outside this range can always be skipped.
*/
if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
return (B_FALSE);
/*
* Check if the top-level vdev must resilver this offset.
* When the offset does not intersect with a dirty leaf DTL
* then it may be possible to skip the resilver IO. The psize
* is provided instead of asize to simplify the check for RAIDZ.
*/
if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
return (B_FALSE);
/*

View File

@ -32,6 +32,7 @@
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
@ -1563,6 +1564,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
defined(WITH_CF_BLOCK_ALLOCATOR)
/*
* This is a helper function that can be used by the allocator to find a
* suitable block to allocate. This will search the specified B-tree looking
@ -1654,6 +1656,7 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
range_seg_t *rs;
if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
metaslab_size_tree_full_load(msp->ms_allocatable);
if (metaslab_df_use_largest_segment) {
/* use largest free segment */
rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
@ -2616,6 +2619,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
ms->ms_allocator = -1;
ms->ms_new = B_TRUE;
vdev_ops_t *ops = vd->vdev_ops;
if (ops->vdev_op_metaslab_init != NULL)
ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
@ -5813,7 +5820,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);

View File

@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa)
if (leaf == NULL)
leaf = list_head(&spa->spa_leaf_list);
if (!vdev_writeable(leaf)) {
/*
* We skip unwritable, offline, detached, and dRAID spare
* devices as they are either not legal targets or the write
* may fail or not be seen by other hosts. Skipped dRAID
* spares can never be written so the fail mask is not set.
*/
if (!vdev_writeable(leaf) || leaf->vdev_offline ||
leaf->vdev_detached) {
fail_mask |= MMP_FAIL_NOT_WRITABLE;
} else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
continue;
} else if (leaf->vdev_mmp_pending != 0) {
fail_mask |= MMP_FAIL_WRITE_PENDING;
} else {

View File

@ -60,6 +60,7 @@
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/mmp.h>
@ -3681,7 +3682,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
/*
* Build a new vdev tree from the trusted config
*/
VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
if (error != 0) {
nvlist_free(mos_config);
spa_config_exit(spa, SCL_ALL, FTAG);
spa_load_failed(spa, "spa_config_parse failed [error=%d]",
error);
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
}
/*
* Vdev paths in the MOS may be obsolete. If the untrusted config was
@ -5631,7 +5639,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
uint64_t version, obj;
uint64_t version, obj, ndraid = 0;
boolean_t has_features;
boolean_t has_encryption;
boolean_t has_allocclass;
@ -5753,8 +5761,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (error == 0 &&
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
(error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
/*
* instantiate the metaslab groups (this will dirty the vdevs)
* we can no longer error exit past this point
@ -5895,6 +5903,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa_sync_props(props, tx);
}
for (int i = 0; i < ndraid; i++)
spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
dmu_tx_commit(tx);
spa->spa_sync_on = B_TRUE;
@ -6403,13 +6414,26 @@ spa_reset(const char *pool)
* ==========================================================================
*/
/*
 * Synctask callback which increments the SPA_FEATURE_DRAID refcount.
 *
 * arg - the number of increments to apply.  The count is carried by value
 *	 in the pointer itself (the dispatcher passes
 *	 (void *)(uintptr_t)ndraid) and is never dereferenced.
 * tx  - the transaction in which the feature refcount is bumped.
 */
static void
spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	/*
	 * The caller's count is a uint64_t.  Keep it unsigned here; narrowing
	 * it to a signed int could truncate it or turn it negative, in which
	 * case the loop below would silently do nothing.
	 */
	uint64_t ndraid = (uintptr_t)arg;

	for (uint64_t c = 0; c < ndraid; c++)
		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
}
/*
* Add a device to a storage pool.
*/
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
uint64_t txg;
uint64_t txg, ndraid = 0;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
@ -6438,8 +6462,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
if (vd->vdev_children != 0 &&
(error = vdev_create(vd, txg, B_FALSE)) != 0)
(error = vdev_create(vd, txg, B_FALSE)) != 0) {
return (spa_vdev_exit(spa, vd, txg, error));
}
/*
* The virtual dRAID spares must be added after vdev tree is created
* and the vdev guids are generated. The guid of their associated
* dRAID is stored in the config and used when opening the spare.
*/
if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
rvd->vdev_children)) == 0) {
if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
nspares = 0;
} else {
return (spa_vdev_exit(spa, vd, txg, error));
}
/*
* We must validate the spares and l2cache devices after checking the
@ -6452,7 +6491,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* If we are in the middle of a device removal, we can only add
* devices which match the existing devices in the pool.
* If we are in the middle of a removal, or have some indirect
* vdevs, we can not add raidz toplevels.
* vdevs, we can not add raidz or dRAID top levels.
*/
if (spa->spa_vdev_removal != NULL ||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
@ -6462,10 +6501,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
if (tvd->vdev_ops == &vdev_raidz_ops) {
/* Fail if top level vdev is raidz or a dRAID */
if (vdev_get_nparity(tvd) != 0)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/*
* Need the top level mirror to be
* a mirror of leaf vdevs only
@ -6505,6 +6544,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
spa->spa_l2cache.sav_sync = B_TRUE;
}
/*
* We can't increment a feature while holding spa_vdev so we
* have to do it in a synctask.
*/
if (ndraid != 0) {
dmu_tx_t *tx;
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
(void *)(uintptr_t)ndraid, tx);
dmu_tx_commit(tx);
}
/*
* We have to be careful when adding new vdevs to an existing pool.
* If other threads start allocating from these vdevs before we
@ -6615,14 +6667,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* A dRAID spare can only replace a child of its parent dRAID vdev.
*/
if (newvd->vdev_ops == &vdev_draid_spare_ops &&
oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
if (rebuild) {
/*
* For rebuilds, the parent vdev must support reconstruction
* For rebuilds, the top vdev must support reconstruction
* using only space maps. This means the only allowable
* parents are the root vdev or a mirror vdev.
* vdev types are the root vdev, a mirror, or dRAID.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops) {
tvd = pvd;
if (pvd->vdev_top != NULL)
tvd = pvd->vdev_top;
if (tvd->vdev_ops != &vdev_mirror_ops &&
tvd->vdev_ops != &vdev_root_ops &&
tvd->vdev_ops != &vdev_draid_ops) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
}
@ -6915,14 +6980,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
}
/*
* If we are detaching the original disk from a spare, then it implies
* that the spare should become a real disk, and be removed from the
* active spare list for the pool.
* If we are detaching the original disk from a normal spare, then it
* implies that the spare should become a real disk, and be removed
* from the active spare list for the pool. dRAID spares on the
* other hand are coupled to the pool and thus should never be removed
* from the spares list.
*/
if (pvd->vdev_ops == &vdev_spare_ops &&
vd->vdev_id == 0 &&
pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
unspare = B_TRUE;
if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
if (last_cvd->vdev_isspare &&
last_cvd->vdev_ops != &vdev_draid_spare_ops) {
unspare = B_TRUE;
}
}
/*
* Erase the disk labels so the disk can be used for other things.
@ -8013,18 +8084,9 @@ spa_async_thread(void *arg)
/*
* If any devices are done replacing, detach them.
*/
if (tasks & SPA_ASYNC_RESILVER_DONE)
if (tasks & SPA_ASYNC_RESILVER_DONE ||
tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
/*
* If any devices are done replacing, detach them. Then if no
* top-level vdevs are rebuilding attempt to kick off a scrub.
*/
if (tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
if (!vdev_rebuild_active(spa->spa_root_vdev))
(void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
}
/*

View File

@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
spa->spa_min_alloc = INT_MAX;
/* Reset cached value */
spa->spa_dedup_dspace = ~0ULL;

View File

@ -40,6 +40,7 @@
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@ -51,6 +52,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_raidz.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
static vdev_ops_t *vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
&vdev_draid_ops,
&vdev_draid_spare_ops,
&vdev_mirror_ops,
&vdev_replacing_ops,
&vdev_spare_ops,
@ -221,10 +225,11 @@ vdev_getops(const char *type)
/* ARGSUSED */
void
vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
res->rs_start = in->rs_start;
res->rs_end = in->rs_end;
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
}
/*
@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
return (asize);
}
/*
 * Default vdev_op_min_asize callback.  Returns the vdev's own
 * vdev_min_asize unchanged.  vdev_get_min_asize() invokes this callback on
 * the parent of the vdev being sized, so with this default a child must
 * provide at least its parent's minimum asize.
 */
uint64_t
vdev_default_min_asize(vdev_t *vd)
{
return (vd->vdev_min_asize);
}
/*
* Get the minimum allocatable size. We define the allocatable size as
* the vdev's asize rounded to the nearest metaslab. This allows us to
@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
if (vd == vd->vdev_top)
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
/*
* The allocatable space for a raidz vdev is N * sizeof(smallest child),
* so each child must provide at least 1/Nth of its asize.
*/
if (pvd->vdev_ops == &vdev_raidz_ops)
return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
pvd->vdev_children);
return (pvd->vdev_min_asize);
return (pvd->vdev_ops->vdev_op_min_asize(pvd));
}
void
@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
vdev_set_min_asize(vd->vdev_child[c]);
}
/*
 * Report the minimal allocation size for a top-level vdev.  A vdev type may
 * supply its own answer through the optional vdev_op_min_alloc callback;
 * types which leave the callback NULL get 1 << vdev_ashift.
 */
uint64_t
vdev_get_min_alloc(vdev_t *vd)
{
	vdev_ops_t *ops = vd->vdev_ops;

	if (ops->vdev_op_min_alloc == NULL)
		return (1ULL << vd->vdev_ashift);

	return (ops->vdev_op_min_alloc(vd));
}
/*
 * Report the parity level of a top-level vdev.  The value comes from the
 * vdev type's optional vdev_op_nparity callback; types which leave the
 * callback NULL have a parity level of zero.
 */
uint64_t
vdev_get_nparity(vdev_t *vd)
{
	vdev_ops_t *ops = vd->vdev_ops;

	if (ops->vdev_op_nparity == NULL)
		return (0);

	return (ops->vdev_op_nparity(vd));
}
/*
 * Report the number of data disks of a top-level vdev.  The value comes
 * from the vdev type's optional vdev_op_ndisks callback; types which leave
 * the callback NULL are counted as a single data disk.
 */
uint64_t
vdev_get_ndisks(vdev_t *vd)
{
	vdev_ops_t *ops = vd->vdev_ops;

	if (ops->vdev_op_ndisks == NULL)
		return (1);

	return (ops->vdev_op_ndisks(vd));
}
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
list_link_init(&vd->vdev_initialize_node);
list_link_init(&vd->vdev_leaf_node);
list_link_init(&vd->vdev_trim_node);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
{
vdev_ops_t *ops;
char *type;
uint64_t guid = 0, islog, nparity;
uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
char *tmp = NULL;
@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (SET_ERROR(ENOTSUP));
/*
* Set the nparity property for RAID-Z vdevs.
*/
nparity = -1ULL;
if (ops == &vdev_raidz_ops) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (SET_ERROR(EINVAL));
/*
* Previous versions could only support 1 or 2 parity
* device.
*/
if (nparity > 1 &&
spa_version(spa) < SPA_VERSION_RAIDZ2)
return (SET_ERROR(ENOTSUP));
if (nparity > 2 &&
spa_version(spa) < SPA_VERSION_RAIDZ3)
return (SET_ERROR(ENOTSUP));
} else {
/*
* We require the parity to be specified for SPAs that
* support multiple parity levels.
*/
if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
return (SET_ERROR(EINVAL));
/*
* Otherwise, we default to 1 parity device for RAID-Z.
*/
nparity = 1;
}
} else {
nparity = 0;
}
ASSERT(nparity != -1ULL);
/*
* If creating a top-level vdev, check for allocation classes input
*/
if (top_level && alloctype == VDEV_ALLOC_ADD) {
char *bias;
/*
* If creating a top-level vdev, check for allocation
* classes input.
*/
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&bias) == 0) {
alloc_bias = vdev_derive_alloc_bias(bias);
@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
return (SET_ERROR(ENOTSUP));
}
}
/* spa_vdev_add() expects feature to be enabled */
if (ops == &vdev_draid_ops &&
spa->spa_load_state != SPA_LOAD_CREATE &&
!spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
return (SET_ERROR(ENOTSUP));
}
}
/*
* Initialize the vdev specific data. This is done before calling
* vdev_alloc_common() since it may fail and this simplifies the
* error reporting and cleanup code paths.
*/
void *tsd = NULL;
if (ops->vdev_op_init != NULL) {
rc = ops->vdev_op_init(spa, nv, &tsd);
if (rc != 0) {
return (rc);
}
}
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_tsd = tsd;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
vic = &vd->vdev_indirect_config;
ASSERT0(vic->vic_mapping_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
&vic->vic_mapping_object);
@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
if (vd->vdev_ops->vdev_op_fini != NULL)
vd->vdev_ops->vdev_op_fini(vd);
/*
* Discard allocation state.
*/
@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
mutex_destroy(&vd->vdev_rebuild_io_lock);
cv_destroy(&vd->vdev_rebuild_cv);
cv_destroy(&vd->vdev_rebuild_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
}
/*
* Add a mirror/replacing vdev above an existing vdev.
* Add a mirror/replacing vdev above an existing vdev. There is no need to
* call .vdev_op_init() since mirror/replacing vdevs do not have private state.
*/
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
uint64_t min_alloc = vdev_get_min_alloc(vd);
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
}
}
}
@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
return (B_FALSE);
}
/*
 * Default child-selection predicate, used by vdev_open_children().
 * Returns B_TRUE if the passed child should be opened; this default
 * ignores its argument and selects every child.
 */
static boolean_t
vdev_default_open_children_func(vdev_t *vd)
{
return (B_TRUE);
}
/*
 * Open the children of vd for which open_func() returns B_TRUE.
 *
 * The opens are dispatched to a temporary taskq so they can run
 * concurrently.  If any of the leaf vdevs are using a ZFS volume, or the
 * taskq could not be created, the opens are instead done synchronously in
 * the calling thread.  This avoids a deadlock when the current thread is
 * holding the spa_namespace_lock.
 *
 * On return vd->vdev_nonrot is B_TRUE only if every selected child has
 * vdev_nonrot set.
 *
 * NOTE(review): open_func is called twice per child and is assumed to be a
 * side-effect free predicate returning the same answer both times.
 */
static void
vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
{
	int children = vd->vdev_children;

	taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
	    children, children, TASKQ_PREPOPULATE);
	/* The open mode does not depend on the child, decide it once. */
	boolean_t sync_open = (tq == NULL || vdev_uses_zvols(vd));

	vd->vdev_nonrot = B_TRUE;

	for (int c = 0; c < children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (open_func(cvd) == B_FALSE)
			continue;

		if (sync_open) {
			cvd->vdev_open_error = vdev_open(cvd);
		} else {
			VERIFY(taskq_dispatch(tq, vdev_open_child,
			    cvd, TQ_SLEEP) != TASKQID_INVALID);
		}
	}

	if (tq != NULL) {
		taskq_wait(tq);
		taskq_destroy(tq);
	}

	/*
	 * Accumulate vdev_nonrot only after every dispatched open has
	 * completed.  Reading a child's vdev_nonrot immediately after
	 * taskq_dispatch() races with the asynchronous open and may observe
	 * a value the open has not updated yet.
	 */
	for (int c = 0; c < children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (open_func(cvd) == B_FALSE)
			continue;

		vd->vdev_nonrot &= cvd->vdev_nonrot;
	}
}
/*
* Open all child vdevs.
*/
void
vdev_open_children(vdev_t *vd)
{
taskq_t *tq;
int children = vd->vdev_children;
vdev_open_children_impl(vd, vdev_default_open_children_func);
}
/*
* in order to handle pools on top of zvols, do the opens
* in a single thread so that the same thread holds the
* spa_namespace_lock
*/
if (vdev_uses_zvols(vd)) {
retry_sync:
for (int c = 0; c < children; c++)
vd->vdev_child[c]->vdev_open_error =
vdev_open(vd->vdev_child[c]);
} else {
tq = taskq_create("vdev_open", children, minclsyspri,
children, children, TASKQ_PREPOPULATE);
if (tq == NULL)
goto retry_sync;
for (int c = 0; c < children; c++)
VERIFY(taskq_dispatch(tq, vdev_open_child,
vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
taskq_destroy(tq);
}
vd->vdev_nonrot = B_TRUE;
for (int c = 0; c < children; c++)
vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
/*
 * Conditionally open a subset of child vdevs.
 *
 * Forwards to vdev_open_children_impl() with the caller supplied
 * open_func, which is invoked on each child of vd; only children for
 * which it does not return B_FALSE are opened.
 */
void
vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
{
vdev_open_children_impl(vd, open_func);
}
/*
@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd)
return (error);
}
/*
* Track the minimum allocation size.
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
uint64_t min_alloc = vdev_get_min_alloc(vd);
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
}
/*
* If this is a leaf vdev, assess whether a resilver is needed.
* But don't do this if we are doing a reopen for a scrub, since
@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
vdev_t *pvd = vd->vdev_parent;
spa_t *spa __maybe_unused = vd->vdev_spa;
ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
ASSERT(vd != NULL);
ASSERT(vd->vdev_open_thread == curthread ||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
/*
* If our parent is reopening, then we are as well, unless we are
@ -2606,10 +2682,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
}
/*
* Returns B_TRUE if vdev determines offset needs to be resilvered.
* Check if the txg falls within the range which must be
* resilvered. DVAs outside this range can always be skipped.
*/
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
/* Set by sequential resilver. */
if (phys_birth == TXG_UNKNOWN)
return (B_TRUE);
return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
}
/*
* Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
*/
boolean_t
vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
@ -2617,7 +2709,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
vd->vdev_ops->vdev_op_leaf)
return (B_TRUE);
return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
phys_birth));
}
/*
@ -2862,8 +2955,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
minref = 1; /* i.e. non-zero */
else if (vd->vdev_nparity != 0)
minref = vd->vdev_nparity + 1; /* RAID-Z */
else if (vdev_get_nparity(vd) != 0)
minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
else
minref = vd->vdev_children; /* any kind of mirror */
space_reftree_create(&reftree);
@ -3727,6 +3820,9 @@ top:
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
if (vd->vdev_ops == &vdev_draid_spare_ops)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
tvd = vd->vdev_top;
mg = tvd->vdev_mg;
generation = spa->spa_config_generation + 1;
@ -3971,6 +4067,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
/*
* Exclude the dRAID spare when aggregating to avoid double counting
* the ops and bytes. These IOs are counted by the physical leaves.
*/
if (cvd->vdev_ops == &vdev_draid_spare_ops)
return;
for (int t = 0; t < VS_ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
@ -4063,7 +4166,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vdev_get_child_stat(cvd, vs, cvs);
if (vsx)
vdev_get_child_stat_ex(cvd, vsx, cvsx);
}
} else {
/*
@ -4248,7 +4350,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
/*
* Repair is the result of a rebuild issued by the
* rebuild thread (vdev_rebuild_thread).
* rebuild thread (vdev_rebuild_thread). To avoid
* double counting repaired bytes the virtual dRAID
* spare vdev is excluded from the processed bytes.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
vdev_t *tvd = vd->vdev_top;
@ -4256,8 +4360,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
if (vd->vdev_ops->vdev_op_leaf)
if (vd->vdev_ops->vdev_op_leaf &&
vd->vdev_ops != &vdev_draid_spare_ops) {
atomic_add_64(rebuilt, psize);
}
vs->vs_rebuild_processed += psize;
}
@ -4981,31 +5087,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
vdev_resilver_needed(vd, NULL, NULL));
}
/*
 * Returns B_TRUE when the range segment covers no bytes, i.e. its start
 * and end offsets are equal.
 */
boolean_t
vdev_xlate_is_empty(range_seg64_t *rs)
{
	return (rs->rs_end == rs->rs_start);
}
/*
* Translate a logical range to the physical range for the specified vdev_t.
* This function is initially called with a leaf vdev and will walk each
* parent vdev until it reaches a top-level vdev. Once the top-level is
* reached the physical range is initialized and the recursive function
* begins to unwind. As it unwinds it calls the parent's vdev specific
* translation function to do the real conversion.
* Translate a logical range to the first contiguous physical range for the
* specified vdev_t. This function is initially called with a leaf vdev and
* will walk each parent vdev until it reaches a top-level vdev. Once the
* top-level is reached the physical range is initialized and the recursive
* function begins to unwind. As it unwinds it calls the parent's vdev
* specific translation function to do the real conversion.
*/
void
vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
range_seg64_t *physical_rs)
range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
/*
* Walk up the vdev tree
*/
if (vd != vd->vdev_top) {
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
remain_rs);
} else {
/*
* We've reached the top-level vdev, initialize the
* physical range to the logical range and start to
* unwind.
* We've reached the top-level vdev, initialize the physical
* range to the logical range and set an empty remaining
* range then start to unwind.
*/
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
remain_rs->rs_start = logical_rs->rs_start;
remain_rs->rs_end = logical_rs->rs_start;
return;
}
@ -5015,16 +5132,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
/*
* As this recursive function unwinds, translate the logical
* range into its physical components by calling the
* vdev specific translate function.
* range into its physical and any remaining components by calling
* the vdev specific translate function.
*/
range_seg64_t intermediate = { 0 };
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
physical_rs->rs_start = intermediate.rs_start;
physical_rs->rs_end = intermediate.rs_end;
}
/*
 * Translate a logical range piece by piece and invoke func(arg, range)
 * for every non-empty physical range produced.  Each call to vdev_xlate()
 * yields one physical range plus the remaining logical range, which is
 * fed back in until nothing remains.
 */
void
vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
    vdev_xlate_func_t *func, void *arg)
{
	range_seg64_t phys;
	range_seg64_t rest;

	for (range_seg64_t cur = *logical_rs; !vdev_xlate_is_empty(&cur);
	    cur = rest) {
		vdev_xlate(vd, &cur, &phys, &rest);

		/*
		 * With raidz and dRAID, it's possible that the logical
		 * range does not live on this leaf vdev.  Skip the
		 * callback when the physical range has zero size.
		 */
		if (vdev_xlate_is_empty(&phys))
			continue;

		func(arg, &phys);
	}
}
/*
* Look at the vdev tree and determine whether any devices are currently being
* replaced.

2984
module/zfs/vdev_draid.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,40 @@
/*
* Xorshift Pseudo Random Number Generator based on work by David Blackman
* and Sebastiano Vigna (vigna@acm.org).
*
* "Further scramblings of Marsaglia's xorshift generators"
* http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
* http://prng.di.unimi.it/xoroshiro128plusplus.c
*
* To the extent possible under law, the author has dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* See <http://creativecommons.org/publicdomain/zero/1.0/>.
*
* This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
* small-state generators. It is extremely (sub-ns) fast and it passes all
* tests we are aware of, but its state space is large enough only for
* mild parallelism.
*/
#include <sys/vdev_draid.h>
/*
 * Rotate the 64-bit value x left by k bits.  Callers in this file only
 * pass constant rotations strictly between 0 and 64.
 */
static inline uint64_t rotl(const uint64_t x, int k)
{
	const uint64_t upper = x << k;
	const uint64_t lower = x >> (64 - k);

	return (upper | lower);
}

/*
 * Advance the two-word xoroshiro128++ state in s[0..1] by one step and
 * return the next 64-bit pseudo random value.  The state is updated in
 * place, so the same starting state always yields the same sequence.
 */
uint64_t
vdev_draid_rand(uint64_t *s)
{
	const uint64_t a = s[0];
	const uint64_t mixed = s[1] ^ a;

	/* The output is derived from the state before it is advanced. */
	const uint64_t out = rotl(a + s[1], 17) + a;

	s[0] = rotl(a, 49) ^ mixed ^ (mixed << 21);	/* a, b */
	s[1] = rotl(mixed, 28);				/* c */

	return (out);
}

View File

@ -1844,9 +1844,13 @@ vdev_indirect_io_done(zio_t *zio)
}
vdev_ops_t vdev_indirect_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_indirect_open,
.vdev_op_close = vdev_indirect_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_indirect_io_start,
.vdev_op_io_done = vdev_indirect_io_done,
.vdev_op_state_change = NULL,
@ -1855,6 +1859,11 @@ vdev_ops_t vdev_indirect_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = vdev_indirect_remap,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* leaf vdev */
};

View File

@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
vd->vdev_initialize_action_time = gethrestime_sec();
}
vdev_initializing_state_t old_state = vd->vdev_initialize_state;
vd->vdev_initialize_state = new_state;
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
"vdev=%s suspended", vd->vdev_path);
break;
case VDEV_INITIALIZE_CANCELED:
spa_history_log_internal(spa, "initialize", tx,
"vdev=%s canceled", vd->vdev_path);
if (old_state == VDEV_INITIALIZE_ACTIVE ||
old_state == VDEV_INITIALIZE_SUSPENDED)
spa_history_log_internal(spa, "initialize", tx,
"vdev=%s canceled", vd->vdev_path);
break;
case VDEV_INITIALIZE_COMPLETE:
spa_history_log_internal(spa, "initialize", tx,
@ -317,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
return (0);
}
/*
 * vdev_xlate_walk() callback: arg points at a uint64_t holding the
 * largest physical end offset seen so far; raise it if this physical
 * range ends later.
 */
static void
vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
{
	uint64_t *max_end = arg;

	if (*max_end < physical_rs->rs_end)
		*max_end = physical_rs->rs_end;
}
/*
 * vdev_xlate_walk() callback: arg is the vdev being initialized.  Adds
 * the size of the physical range to the estimate, and credits the part
 * of it lying before vdev_initialize_last_offset as already done.
 */
static void
vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
{
	vdev_t *vd = arg;
	const uint64_t start = physical_rs->rs_start;
	const uint64_t end = physical_rs->rs_end;
	const uint64_t mark = vd->vdev_initialize_last_offset;
	uint64_t done = 0;

	if (mark > end) {
		/* The whole range precedes the last initialized offset. */
		done = end - start;
	} else if (mark > start && mark < end) {
		/* The last initialized offset falls inside this range. */
		done = mark - start;
	}

	vd->vdev_initialize_bytes_est += end - start;
	vd->vdev_initialize_bytes_done += done;
}
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd)
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;
uint64_t ms_free = (msp->ms_size -
metaslab_allocated_space(msp)) /
vdev_get_ndisks(vd->vdev_top);
/*
* Convert the metaslab range to a physical range
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
vdev_xlate(vd, &logical_rs, &physical_rs);
/* Metaslab space after this offset has not been initialized */
vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
vd->vdev_initialize_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
continue;
} else if (vd->vdev_initialize_last_offset >
physical_rs.rs_end) {
}
/* Metaslab space before this offset has been initialized */
uint64_t last_rs_end = physical_rs.rs_end;
if (!vdev_xlate_is_empty(&remain_rs)) {
vdev_xlate_walk(vd, &remain_rs,
vdev_initialize_xlate_last_rs_end, &last_rs_end);
}
if (vd->vdev_initialize_last_offset > last_rs_end) {
vd->vdev_initialize_bytes_done += ms_free;
vd->vdev_initialize_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd)
&where)) {
logical_rs.rs_start = rs_get_start(rs, rt);
logical_rs.rs_end = rs_get_end(rs, rt);
vdev_xlate(vd, &logical_rs, &physical_rs);
uint64_t size = physical_rs.rs_end -
physical_rs.rs_start;
vd->vdev_initialize_bytes_est += size;
if (vd->vdev_initialize_last_offset >
physical_rs.rs_end) {
vd->vdev_initialize_bytes_done += size;
} else if (vd->vdev_initialize_last_offset >
physical_rs.rs_start &&
vd->vdev_initialize_last_offset <
physical_rs.rs_end) {
vd->vdev_initialize_bytes_done +=
vd->vdev_initialize_last_offset -
physical_rs.rs_start;
}
vdev_xlate_walk(vd, &logical_rs,
vdev_initialize_xlate_progress, vd);
}
mutex_exit(&msp->ms_lock);
}
@ -419,6 +443,34 @@ vdev_initialize_load(vdev_t *vd)
return (err);
}
/*
 * vdev_xlate_walk() callback: arg is the vdev being initialized.  Adds
 * the not yet visited portion of the physical range to the vdev's
 * initialize tree.  The passed range is trimmed in place when it
 * straddles vdev_initialize_last_offset.
 */
static void
vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
{
	vdev_t *vd = arg;

	/* Ranges entirely at or before the last offset were visited. */
	if (vd->vdev_initialize_last_offset >= physical_rs->rs_end)
		return;

	/* Resume part way through a range that was partially visited. */
	if (physical_rs->rs_start < vd->vdev_initialize_last_offset) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs->rs_start,
		    (u_longlong_t)physical_rs->rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs->rs_end);
		ASSERT3U(physical_rs->rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs->rs_start = vd->vdev_initialize_last_offset;
	}

	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);

	uint64_t length = physical_rs->rs_end - physical_rs->rs_start;
	range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
	    length);
}
/*
* Convert the logical range into a physical range and add it to our
* avl tree.
@ -427,47 +479,12 @@ static void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
vdev_t *vd = arg;
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs;
logical_rs.rs_start = start;
logical_rs.rs_end = start + size;
ASSERT(vd->vdev_ops->vdev_op_leaf);
vdev_xlate(vd, &logical_rs, &physical_rs);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_start == physical_rs.rs_start);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_end == physical_rs.rs_end);
/* Only add segments that we have not visited yet */
if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
return;
/* Pick up where we left off mid-range. */
if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
"(%llu, %llu)", vd->vdev_path,
(u_longlong_t)physical_rs.rs_start,
(u_longlong_t)physical_rs.rs_end,
(u_longlong_t)vd->vdev_initialize_last_offset,
(u_longlong_t)physical_rs.rs_end);
ASSERT3U(physical_rs.rs_end, >,
vd->vdev_initialize_last_offset);
physical_rs.rs_start = vd->vdev_initialize_last_offset;
}
ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
/*
* With raidz, it's possible that the logical range does not live on
* this leaf vdev. We only add the physical range to this vdev's if it
* has a length greater than 0.
*/
if (physical_rs.rs_end > physical_rs.rs_start) {
range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
physical_rs.rs_end - physical_rs.rs_start);
} else {
ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
}
vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
}
static void

View File

@ -142,6 +142,7 @@
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_fru != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
VDEV_TYPE_RAIDZ) == 0);
if (vd->vdev_ops->vdev_op_config_generate != NULL)
vd->vdev_ops->vdev_op_config_generate(vd, nv);
/*
* Make sure someone hasn't managed to sneak a fancy new vdev
* into a crufty old storage pool.
*/
ASSERT(vd->vdev_nparity == 1 ||
(vd->vdev_nparity <= 2 &&
spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
(vd->vdev_nparity <= 3 &&
spa_version(spa) >= SPA_VERSION_RAIDZ3));
/*
* Note that we'll add the nparity tag even on storage pools
* that only support a single parity device -- older software
* will just ignore it.
*/
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
}
if (vd->vdev_wholedisk != -1ULL)
if (vd->vdev_wholedisk != -1ULL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
}
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
if (!vdev_readable(vd))
return (NULL);
/*
* The label for a dRAID distributed spare is not stored on disk.
* Instead it is generated when needed which allows us to bypass
* the pipeline when reading the config from the label.
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return (vdev_draid_read_config_spare(vd));
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
vp = abd_to_buf(vp_abd);
@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
for (int c = 0; c < vd->vdev_children; c++)
vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
vd->vdev_ops != &vdev_draid_spare_ops) {
for (int l = 0; l < VDEV_LABELS; l++) {
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd)
SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
/*
* No uberblocks are stored on distributed spares, they may be
* safely skipped when expanding a leaf vdev.
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return;
spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER);
ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
if (!vdev_writeable(vd))
return;
/*
* There's no need to write uberblocks to a distributed spare, they
* are already stored on all the leaves of the parent dRAID. For
* this same reason vdev_uberblock_load_impl() skips distributed
* spares when reading uberblocks.
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return;
/* If the vdev was expanded, need to copy uberblock rings. */
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
vd->vdev_copy_uberblocks == B_TRUE) {
@ -1763,6 +1771,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes,
if (!vdev_writeable(vd))
return;
/*
* The top-level config never needs to be written to a distributed
* spare. When the label is read, vdev_draid_read_config_spare()
* generates the config for vdev_label_read_config().
*/
if (vd->vdev_ops == &vdev_draid_spare_ops)
return;
/*
* Generate a label describing the top-level config to which we belong.
*/

View File

@ -33,6 +33,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void)
/*
* Virtual device vector for mirroring.
*/
typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
@ -108,6 +108,7 @@ typedef struct mirror_child {
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
uint8_t mc_rebuilding;
} mirror_child_t;
typedef struct mirror_map {
@ -115,6 +116,7 @@ typedef struct mirror_map {
int mm_preferred_cnt;
int mm_children;
boolean_t mm_resilvering;
boolean_t mm_rebuilding;
boolean_t mm_root;
mirror_child_t mm_child[];
} mirror_map_t;
@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
return (load + zfs_vdev_mirror_rotating_seek_inc);
}
/*
 * Returns B_TRUE if vd, or any vdev beneath it, is a leaf with a
 * non-zero vdev_rebuild_txg.
 */
static boolean_t
vdev_mirror_rebuilding(vdev_t *vd)
{
	boolean_t rebuilding = B_FALSE;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg != 0)
		return (B_TRUE);

	/* Stop descending as soon as one rebuilding leaf is found. */
	for (int c = 0; c < vd->vdev_children && !rebuilding; c++)
		rebuilding = vdev_mirror_rebuilding(vd->vdev_child[c]);

	return (rebuilding);
}
/*
* Avoid inlining the function to keep vdev_mirror_io_start(), which
* is this function's only caller, as small as possible on the stack.
@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio)
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;
if (vdev_mirror_rebuilding(mc->mc_vd))
mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
}
}
@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio)
return (mm->mm_preferred[p]);
}
/*
 * Returns whether the mirror child can be read.  Children whose
 * top-level vdev is a dRAID are checked with vdev_draid_readable() at
 * the child's offset; all others use vdev_readable().
 */
static boolean_t
vdev_mirror_child_readable(mirror_child_t *mc)
{
	vdev_t *cvd = mc->mc_vd;
	vdev_t *top = cvd->vdev_top;

	if (top == NULL || top->vdev_ops != &vdev_draid_ops)
		return (vdev_readable(cvd));

	return (vdev_draid_readable(cvd, mc->mc_offset));
}
/*
 * Returns whether the given txg range is missing from the mirror child.
 * Children whose top-level vdev is a dRAID are checked with
 * vdev_draid_missing() at the child's offset; all others consult the
 * child's DTL_MISSING via vdev_dtl_contains().
 */
static boolean_t
vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
{
	vdev_t *cvd = mc->mc_vd;
	vdev_t *top = cvd->vdev_top;

	if (top == NULL || top->vdev_ops != &vdev_draid_ops)
		return (vdev_dtl_contains(cvd, DTL_MISSING, txg, size));

	return (vdev_draid_missing(cvd, mc->mc_offset, txg, size));
}
/*
* Try to find a vdev whose DTL doesn't contain the block we want to read
* preferring vdevs based on determined load.
* preferring vdevs based on determined load. If we can't, try the read on
* any vdev we haven't already tried.
*
* Try to find a child whose DTL doesn't contain the block we want to read.
* If we can't, try the read on any vdev we haven't already tried.
* Distributed spares are an exception to the above load rule. They are
* always preferred in order to detect gaps in the distributed spare which
* are created when another disk in the dRAID fails. In order to restore
* redundancy those gaps must be read to trigger the required repair IO.
*/
static int
vdev_mirror_child_select(zio_t *zio)
@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio)
if (mc->mc_tried || mc->mc_skipped)
continue;
if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
if (mc->mc_vd == NULL ||
!vdev_mirror_child_readable(mc)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
if (vdev_mirror_child_missing(mc, txg, 1)) {
mc->mc_error = SET_ERROR(ESTALE);
mc->mc_skipped = 1;
mc->mc_speculative = 1;
continue;
}
if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
mm->mm_preferred[0] = c;
mm->mm_preferred_cnt = 1;
break;
}
mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
if (mc->mc_load > lowest_load)
continue;
@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
c++;
/*
* When sequentially resilvering only issue write repair
* IOs to the vdev which is being rebuilt since performance
* is limited by the slowest child. This is an issue for
* faster replacement devices such as distributed spares.
*/
if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SCRUB) &&
mm->mm_rebuilding && !mc->mc_rebuilding) {
continue;
}
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
c++;
}
zio_execute(zio);
@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c];
if (mc->mc_error == 0) {
vdev_ops_t *ops = mc->mc_vd->vdev_ops;
if (mc->mc_tried)
continue;
/*
@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio)
* 1. it's a scrub (in which case we have
* tried everything that was healthy)
* - or -
* 2. it's an indirect vdev (in which case
* it could point to any other vdev, which
* might have a bad DTL)
* 2. it's an indirect or distributed spare
* vdev (in which case it could point to any
* other vdev, which might have a bad DTL)
* - or -
* 3. the DTL indicates that this data is
* missing from this vdev
*/
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
ops != &vdev_indirect_ops &&
ops != &vdev_draid_spare_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
}
}
/*
 * Return the maximum asize for a rebuild zio in the provided range.
 *
 * max_segment is rounded up to the vdev's ashift alignment and capped at
 * SPA_MAXBLOCKSIZE, converted to an allocated size, and the result is
 * limited to the asize passed in.  The start argument is not used here.
 */
static uint64_t
vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
    uint64_t max_segment)
{
	uint64_t aligned = P2ROUNDUP(max_segment, 1 << vd->vdev_ashift);
	uint64_t psize = MIN(aligned, SPA_MAXBLOCKSIZE);
	uint64_t limit = vdev_psize_to_asize(vd, psize);

	return (MIN(asize, limit));
}
vdev_ops_t vdev_mirror_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_replacing_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_spare_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
.vdev_op_need_resilver = NULL,
.vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};

View File

@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio)
}
vdev_ops_t vdev_missing_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_missing_open,
.vdev_op_close = vdev_missing_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_missing_io_start,
.vdev_op_io_done = vdev_missing_io_done,
.vdev_op_state_change = NULL,
@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
vdev_ops_t vdev_hole_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_missing_open,
.vdev_op_close = vdev_missing_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_missing_io_start,
.vdev_op_io_done = vdev_missing_io_done,
.vdev_op_state_change = NULL,
@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};

View File

@ -593,6 +593,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
return (NULL);
/*
* I/Os to distributed spares are directly dispatched to the dRAID
* leaf vdevs for aggregation. See the comment at the end of the
* zio_vdev_io_start() function.
*/
ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)

File diff suppressed because it is too large Load Diff

View File

@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void)
* Select parity generation method for raidz_map
*/
int
vdev_raidz_math_generate(raidz_map_t *rm)
vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
{
raidz_gen_f gen_parity = NULL;
@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm)
if (gen_parity == NULL)
return (RAIDZ_ORIGINAL_IMPL);
gen_parity(rm);
gen_parity(rr);
return (0);
}
@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
* @nbaddata - Number of failed data columns
*/
int
vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
const int *dt, const int nbaddata)
vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
const int *parity_valid, const int *dt, const int nbaddata)
{
raidz_rec_f rec_fn = NULL;
@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
if (rec_fn == NULL)
return (RAIDZ_ORIGINAL_IMPL);
else
return (rec_fn(rm, dt));
return (rec_fn(rr, dt));
}
const char *raidz_gen_name[] = {

View File

@ -26,6 +26,7 @@
#define _VDEV_RAIDZ_MATH_IMPL_H
#include <sys/types.h>
#include <sys/vdev_raidz_impl.h>
#define raidz_inline inline __attribute__((always_inline))
#ifndef noinline
@ -36,33 +37,33 @@
* Functions calculate multiplication constants for data reconstruction.
* Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
* used parity columns for reconstruction.
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
* @coeff output array of coefficients. Array must be provided by
* user and must hold minimum MUL_CNT values.
*/
static noinline void
raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
}
static noinline void
raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
}
static noinline void
raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
gf_t a, b, e;
@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
const unsigned ncols = raidz_ncols(rm);
const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
const unsigned z = tgtidx[TARGET_Z];
@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private)
/*
* Generate P parity (RAIDZ1)
*
* @rm RAIDZ map
* @rr RAIDZ row
*/
static raidz_inline void
raidz_generate_p_impl(raidz_map_t * const rm)
raidz_generate_p_impl(raidz_row_t * const rr)
{
size_t c;
const size_t ncols = raidz_ncols(rm);
const size_t psize = rm->rm_col[CODE_P].rc_size;
abd_t *pabd = rm->rm_col[CODE_P].rc_abd;
const size_t ncols = rr->rr_cols;
const size_t psize = rr->rr_col[CODE_P].rc_size;
abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
size_t size;
abd_t *dabd;
raidz_math_begin();
/* start with first data column */
raidz_copy(pabd, rm->rm_col[1].rc_abd, psize);
raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
for (c = 2; c < ncols; c++) {
dabd = rm->rm_col[c].rc_abd;
size = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
size = rr->rr_col[c].rc_size;
/* add data column */
raidz_add(pabd, dabd, size);
@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
/*
* Generate PQ parity (RAIDZ2)
*
* @rm RAIDZ map
* @rr RAIDZ row
*/
static raidz_inline void
raidz_generate_pq_impl(raidz_map_t * const rm)
raidz_generate_pq_impl(raidz_row_t * const rr)
{
size_t c;
const size_t ncols = raidz_ncols(rm);
const size_t csize = rm->rm_col[CODE_P].rc_size;
const size_t ncols = rr->rr_cols;
const size_t csize = rr->rr_col[CODE_P].rc_size;
size_t dsize;
abd_t *dabd;
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd
};
raidz_math_begin();
raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
for (c = 3; c < ncols; c++) {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
raidz_gen_pq_add);
@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
/*
* Generate PQR parity (RAIDZ3)
*
* @rm RAIDZ map
* @rr RAIDZ row
*/
static raidz_inline void
raidz_generate_pqr_impl(raidz_map_t * const rm)
raidz_generate_pqr_impl(raidz_row_t * const rr)
{
size_t c;
const size_t ncols = raidz_ncols(rm);
const size_t csize = rm->rm_col[CODE_P].rc_size;
const size_t ncols = rr->rr_cols;
const size_t csize = rr->rr_col[CODE_P].rc_size;
size_t dsize;
abd_t *dabd;
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
raidz_math_begin();
raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
for (c = 4; c < ncols; c++) {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
raidz_gen_pqr_add);
@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm)
* @syn_method raidz_add_abd()
* @rec_method not applicable
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t xsize = rm->rm_col[x].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
size_t size;
abd_t *dabd;
if (xabd == NULL)
return (1 << CODE_P);
raidz_math_begin();
/* copy P into target */
raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
/* generate p_syndrome */
for (c = firstdc; c < ncols; c++) {
if (c == x)
continue;
dabd = rm->rm_col[c].rc_abd;
size = MIN(rm->rm_col[c].rc_size, xsize);
dabd = rr->rr_col[c].rc_abd;
size = MIN(rr->rr_col[c].rc_size, xsize);
raidz_add(xabd, dabd, size);
}
@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
* @syn_method raidz_add_abd()
* @rec_method raidz_mul_abd_cb()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
abd_t *xabd = rm->rm_col[x].rc_abd;
const size_t xsize = rm->rm_col[x].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
abd_t *tabds[] = { xabd };
if (xabd == NULL)
return (1 << CODE_Q);
unsigned coeff[MUL_CNT];
raidz_rec_q_coeff(rm, tgtidx, coeff);
raidz_rec_q_coeff(rr, tgtidx, coeff);
raidz_math_begin();
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
}
@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
}
/* add Q to the syndrome */
raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize);
raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
* @syn_method raidz_add_abd()
* @rec_method raidz_mul_abd_cb()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t xsize = rm->rm_col[x].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *tabds[] = { xabd };
if (xabd == NULL)
return (1 << CODE_R);
unsigned coeff[MUL_CNT];
raidz_rec_r_coeff(rm, tgtidx, coeff);
raidz_rec_r_coeff(rr, tgtidx, coeff);
raidz_math_begin();
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
}
@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
}
/* add R to the syndrome */
raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize);
raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
* @syn_method raidz_syn_pq_abd()
* @rec_method raidz_rec_pq_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_P) | (1 << CODE_Q));
unsigned coeff[MUL_CNT];
raidz_rec_pq_coeff(rm, tgtidx, coeff);
raidz_rec_pq_coeff(rr, tgtidx, coeff);
/*
* Check if some of the targets are shorter than the others
@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
/* Copy shorter targets back to the original abd buffer */
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_pr_abd()
* @rec_method raidz_rec_pr_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[0];
const size_t y = tgtidx[1];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_P) | (1 << CODE_R));
unsigned coeff[MUL_CNT];
raidz_rec_pr_coeff(rm, tgtidx, coeff);
raidz_rec_pr_coeff(rr, tgtidx, coeff);
/*
* Check if some of the targets are shorter than the others.
@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
if (ysize < xsize)
abd_free(yabd);
return ((1 << CODE_P) | (1 << CODE_Q));
return ((1 << CODE_P) | (1 << CODE_R));
}
@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_qr_abd()
* @rec_method raidz_rec_qr_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
rm->rm_col[CODE_Q].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_Q].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_Q) | (1 << CODE_R));
unsigned coeff[MUL_CNT];
raidz_rec_qr_coeff(rm, tgtidx, coeff);
raidz_rec_qr_coeff(rr, tgtidx, coeff);
/*
* Check if some of the targets are shorter than the others
@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_pqr_abd()
* @rec_method raidz_rec_pqr_abd()
*
* @rm RAIDZ map
* @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
const size_t firstdc = raidz_parity(rm);
const size_t ncols = raidz_ncols(rm);
const size_t firstdc = rr->rr_firstdatacol;
const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
const size_t z = tgtidx[TARGET_Z];
const size_t xsize = rm->rm_col[x].rc_size;
const size_t ysize = rm->rm_col[y].rc_size;
const size_t zsize = rm->rm_col[z].rc_size;
abd_t *xabd = rm->rm_col[x].rc_abd;
abd_t *yabd = rm->rm_col[y].rc_abd;
abd_t *zabd = rm->rm_col[z].rc_abd;
const size_t xsize = rr->rr_col[x].rc_size;
const size_t ysize = rr->rr_col[y].rc_size;
const size_t zsize = rr->rr_col[z].rc_size;
abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *zabd = rr->rr_col[z].rc_abd;
abd_t *tabds[] = { xabd, yabd, zabd };
abd_t *cabds[] = {
rm->rm_col[CODE_P].rc_abd,
rm->rm_col[CODE_Q].rc_abd,
rm->rm_col[CODE_R].rc_abd
rr->rr_col[CODE_P].rc_abd,
rr->rr_col[CODE_Q].rc_abd,
rr->rr_col[CODE_R].rc_abd
};
if (xabd == NULL)
return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
unsigned coeff[MUL_CNT];
raidz_rec_pqr_coeff(rm, tgtidx, coeff);
raidz_rec_pqr_coeff(rr, tgtidx, coeff);
/*
* Check if some of the targets are shorter than the others
@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
dabd = rm->rm_col[c].rc_abd;
dsize = rm->rm_col[c].rc_size;
dabd = rr->rr_col[c].rc_abd;
dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
if (zsize < xsize)
raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
raidz_math_end();

View File

@ -25,6 +25,7 @@
*/
#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/dsl_scan.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
@ -63,13 +64,15 @@
*
* Limitations:
*
* - Only supported for mirror vdev types. Due to the variable stripe
* width used by raidz sequential reconstruction is not possible.
* - Sequential reconstruction is not possible on RAIDZ due to its
* variable stripe width. Note dRAID uses a fixed stripe width which
* avoids this issue, but comes at the expense of some usable capacity.
*
* - Block checksums are not verified during sequential reconstuction.
* - Block checksums are not verified during sequential reconstruction.
* Similar to traditional RAID the parity/mirror data is reconstructed
* but cannot be immediately double checked. For this reason when the
* last active resilver completes the pool is automatically scrubbed.
* last active resilver completes the pool is automatically scrubbed
* by default.
*
* - Deferred resilvers using sequential reconstruction are not currently
* supported. When adding another vdev to an active top-level resilver
@ -77,8 +80,8 @@
*
* Advantages:
*
* - Sequential reconstuction is performed in LBA order which may be faster
* than healing reconstuction particularly when using using HDDs (or
* - Sequential reconstruction is performed in LBA order which may be faster
* than healing reconstruction particularly when using HDDs (or
* especially with SMR devices). Only allocated capacity is resilvered.
*
* - Sequential reconstruction is not constrained by ZFS block boundaries.
@ -86,9 +89,9 @@
* allowing all of these logical blocks to be repaired with a single IO.
*
* - Unlike a healing resilver or scrub which are pool wide operations,
* sequential reconstruction is handled by the top-level mirror vdevs.
* This allows for it to be started or canceled on a top-level vdev
* without impacting any other top-level vdevs in the pool.
* sequential reconstruction is handled by the top-level vdevs. This
* allows for it to be started or canceled on a top-level vdev without
* impacting any other top-level vdevs in the pool.
*
* - Data only referenced by a pool checkpoint will be repaired because
* that space is reflected in the space maps. This differs for a
@ -97,18 +100,36 @@
/*
* Maximum number of queued rebuild I/Os top-level vdev. The number of
* concurrent rebuild I/Os issued to the device is controlled by the
* zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
* options.
*/
unsigned int zfs_rebuild_queue_limit = 20;
/*
* Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
* Size of rebuild reads; defaults to 1MiB per data disk and is capped at
* SPA_MAXBLOCKSIZE.
*/
unsigned long zfs_rebuild_max_segment = 1024 * 1024;
/*
* Maximum number of parallelly executed bytes per leaf vdev caused by a
* sequential resilver. We attempt to strike a balance here between keeping
* the vdev queues full of I/Os at all times and not overflowing the queues
* to cause long latency, which would cause long txg sync times.
*
* A large default value can be safely used here because the default target
* segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
* the queue depth short.
*
* 32MB was selected as the default value to achieve good performance with
* a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
* rebuild was unable to saturate all of the drives using smaller values.
* With a value of 32MB the sequential resilver write rate was measured at
* 800MB/s sustained while rebuilding to a distributed spare.
*/
unsigned long zfs_rebuild_vdev_limit = 32 << 20;
/*
* Automatically start a pool scrub when the last active sequential resilver
* completes in order to verify the checksums of all blocks which have been
* resilvered. This option is enabled by default and is strongly recommended.
*/
int zfs_rebuild_scrub_enabled = 1;
/*
* For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
*/
@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
REBUILD_PHYS_ENTRIES, vrp, tx));
vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
spa_history_log_internal(spa, "rebuild", tx,
@ -306,7 +327,16 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
vd->vdev_rebuilding = B_FALSE;
mutex_exit(&vd->vdev_rebuild_lock);
spa_notify_waiters(spa);
/*
* While we're in syncing context take the opportunity to
* set up the scrub when there are no more active rebuilds.
*/
if (!vdev_rebuild_active(spa->spa_root_vdev) &&
zfs_rebuild_scrub_enabled) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_setup_sync(&func, tx);
}
cv_broadcast(&vd->vdev_rebuild_cv);
}
@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
vdev_t *vd = vr->vr_top_vdev;
mutex_enter(&vd->vdev_rebuild_io_lock);
mutex_enter(&vr->vr_io_lock);
if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
/*
* The I/O failed because the top-level vdev was unavailable.
@ -455,34 +485,30 @@ vdev_rebuild_cb(zio_t *zio)
abd_free(zio->io_abd);
ASSERT3U(vd->vdev_rebuild_inflight, >, 0);
vd->vdev_rebuild_inflight--;
cv_broadcast(&vd->vdev_rebuild_io_cv);
mutex_exit(&vd->vdev_rebuild_io_lock);
ASSERT3U(vr->vr_bytes_inflight, >, 0);
vr->vr_bytes_inflight -= zio->io_size;
cv_broadcast(&vr->vr_io_cv);
mutex_exit(&vr->vr_io_lock);
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}
/*
* Rebuild the data in this range by constructing a special dummy block
* pointer for the given range. It has no relation to any existing blocks
* in the pool. But by disabling checksum verification and issuing a scrub
* I/O mirrored vdevs will replicate the block using any available mirror
* leaf vdevs.
* Initialize a block pointer that can be used to read the given segment
* for sequential rebuild.
*/
static void
vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
uint64_t txg)
vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
uint64_t asize)
{
vdev_t *vd = vr->vr_top_vdev;
spa_t *spa = vd->vdev_spa;
uint64_t psize = asize;
ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
ASSERT(vd->vdev_ops == &vdev_draid_ops ||
vd->vdev_ops == &vdev_mirror_ops ||
vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
blkptr_t blk, *bp = &blk;
uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
vdev_draid_asize_to_psize(vd, asize) : asize;
BP_ZERO(bp);
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
BP_SET_LEVEL(bp, 0);
BP_SET_DEDUP(bp, 0);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
/*
* We increment the issued bytes by the asize rather than the psize
* so the scanned and issued bytes may be directly compared. This
* is consistent with the scrub/resilver issued reporting.
*/
vr->vr_pass_bytes_issued += asize;
vr->vr_rebuild_phys.vrp_bytes_issued += asize;
zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp,
abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
ZIO_FLAG_RESILVER, NULL));
}
/*
@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
vdev_t *vd = vr->vr_top_vdev;
spa_t *spa = vd->vdev_spa;
blkptr_t blk;
ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vr->vr_pass_bytes_scanned += size;
vr->vr_rebuild_phys.vrp_bytes_scanned += size;
mutex_enter(&vd->vdev_rebuild_io_lock);
/*
* Rebuild the data in this range by constructing a special block
* pointer. It has no relation to any existing blocks in the pool.
* However, by disabling checksum verification and issuing a scrub IO
* we can reconstruct and repair any children with missing data.
*/
vdev_rebuild_blkptr_init(&blk, vd, start, size);
uint64_t psize = BP_GET_PSIZE(&blk);
if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
return (0);
mutex_enter(&vr->vr_io_lock);
/* Limit in flight rebuild I/Os */
while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit)
cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
vd->vdev_rebuild_inflight++;
mutex_exit(&vd->vdev_rebuild_io_lock);
vr->vr_bytes_inflight += psize;
mutex_exit(&vr->vr_io_lock);
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
@ -558,45 +584,29 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
/* When exiting write out our progress. */
if (vdev_rebuild_should_stop(vd)) {
mutex_enter(&vd->vdev_rebuild_io_lock);
vd->vdev_rebuild_inflight--;
mutex_exit(&vd->vdev_rebuild_io_lock);
mutex_enter(&vr->vr_io_lock);
vr->vr_bytes_inflight -= psize;
mutex_exit(&vr->vr_io_lock);
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
mutex_exit(&vd->vdev_rebuild_lock);
dmu_tx_commit(tx);
return (SET_ERROR(EINTR));
}
mutex_exit(&vd->vdev_rebuild_lock);
vr->vr_scan_offset[txg & TXG_MASK] = start + size;
vdev_rebuild_rebuild_block(vr, start, size, txg);
dmu_tx_commit(tx);
vr->vr_scan_offset[txg & TXG_MASK] = start + size;
vr->vr_pass_bytes_issued += size;
vr->vr_rebuild_phys.vrp_bytes_issued += size;
zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
ZIO_FLAG_RESILVER, NULL));
return (0);
}
/*
* Split range into legally-sized logical chunks given the constraints of the
* top-level mirror vdev type.
*/
static uint64_t
vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size)
{
uint64_t chunk_size, max_asize, max_segment;
ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment,
1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE);
max_asize = vdev_psize_to_asize(vd, max_segment);
chunk_size = MIN(size, max_asize);
return (chunk_size);
}
/*
* Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
*/
@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr)
while (size > 0) {
uint64_t chunk_size;
chunk_size = vdev_rebuild_chunk_size(vd, start, size);
/*
* Split range into legally-sized logical chunks
* given the constraints of the top-level vdev
* being rebuilt (dRAID or mirror).
*/
ASSERT3P(vd->vdev_ops, !=, NULL);
chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
start, size, zfs_rebuild_max_segment);
error = vdev_rebuild_range(vr, start, chunk_size);
if (error != 0)
@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg)
vr->vr_top_vdev = vd;
vr->vr_scan_msp = NULL;
vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
vr->vr_pass_start_time = gethrtime();
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
vr->vr_bytes_inflight_max = MAX(1ULL << 20,
zfs_rebuild_vdev_limit * vd->vdev_children);
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg)
ASSERT0(range_tree_space(vr->vr_scan_tree));
/*
* Disable any new allocations to this metaslab and wait
* for any writes inflight to complete. This is needed to
* ensure all allocated ranges are rebuilt.
*/
/* Disable any new allocations to this metaslab */
metaslab_disable(msp);
spa_config_exit(spa, SCL_CONFIG, FTAG);
txg_wait_synced(dsl, 0);
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
/*
* If there are outstanding allocations wait for them to be
* synced. This is needed to ensure all allocated ranges are
* on disk and therefore will be rebuilt.
*/
for (int j = 0; j < TXG_SIZE; j++) {
if (range_tree_space(msp->ms_allocating[j])) {
mutex_exit(&msp->ms_lock);
mutex_exit(&msp->ms_sync_lock);
txg_wait_synced(dsl, 0);
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
break;
}
}
/*
* When a metaslab has been allocated from read its allocated
* ranges from the space map object in to the vr_scan_tree.
* ranges from the space map object into the vr_scan_tree.
* Then add inflight / unflushed ranges and remove inflight /
* unflushed frees. This is the minimum range to be rebuilt.
*/
@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg)
/*
* To provide an accurate estimate re-calculate the estimated
* size every 5 minutes to account for recent allocations and
* frees made space maps which have not yet been rebuilt.
* frees made to space maps which have not yet been rebuilt.
*/
if (gethrtime() > update_est_time + SEC2NSEC(300)) {
update_est_time = gethrtime();
@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg)
spa_config_exit(spa, SCL_CONFIG, FTAG);
/* Wait for any remaining rebuild I/O to complete */
mutex_enter(&vd->vdev_rebuild_io_lock);
while (vd->vdev_rebuild_inflight > 0)
cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
mutex_enter(&vr->vr_io_lock);
while (vr->vr_bytes_inflight > 0)
cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
mutex_exit(&vd->vdev_rebuild_io_lock);
mutex_exit(&vr->vr_io_lock);
mutex_destroy(&vr->vr_io_lock);
cv_destroy(&vr->vr_io_cv);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
"Max segment size in bytes of rebuild reads");
"Max segment size in bytes of rebuild reads");
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
"Max bytes in flight per leaf vdev for sequential resilvers");
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
"Automatically scrub after sequential resilver completes");
/* END CSTYLED */

View File

@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
spa_vdev_removal_t *svr = NULL;
uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
ASSERT0(vdev_get_nparity(vd));
svr = spa_vdev_removal_create(vd);
ASSERT(vd->vdev_removing);
@ -1120,7 +1120,7 @@ static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
ASSERT3P(zlist, !=, NULL);
ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
ASSERT0(vdev_get_nparity(vd));
if (vd->vdev_leaf_zap != 0) {
char zkey[32];
@ -2041,7 +2041,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
/*
* All vdevs in normal class must have the same ashift
* and not be raidz.
* and not be raidz or draid.
*/
vdev_t *rvd = spa->spa_root_vdev;
int num_indirect = 0;
@ -2064,7 +2064,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
num_indirect++;
if (!vdev_is_concrete(cvd))
continue;
if (cvd->vdev_ops == &vdev_raidz_ops)
if (vdev_get_nparity(cvd) != 0)
return (SET_ERROR(EINVAL));
/*
* Need the mirror to be mirror of leaf vdevs only
@ -2217,18 +2217,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
* in this pool.
*/
if (vd == NULL || unspare) {
if (vd == NULL)
vd = spa_lookup_by_guid(spa, guid, B_TRUE);
ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_AUX);
char *type;
boolean_t draid_spare = B_FALSE;
vd_type = VDEV_TYPE_SPARE;
vd_path = spa_strdup(fnvlist_lookup_string(
nv, ZPOOL_CONFIG_PATH));
spa_vdev_remove_aux(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
spa_load_spares(spa);
spa->spa_spares.sav_sync = B_TRUE;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
== 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
draid_spare = B_TRUE;
if (vd == NULL && draid_spare) {
error = SET_ERROR(ENOTSUP);
} else {
if (vd == NULL)
vd = spa_lookup_by_guid(spa,
guid, B_TRUE);
ev = spa_event_create(spa, vd, NULL,
ESC_ZFS_VDEV_REMOVE_AUX);
vd_type = VDEV_TYPE_SPARE;
vd_path = spa_strdup(fnvlist_lookup_string(
nv, ZPOOL_CONFIG_PATH));
spa_vdev_remove_aux(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
spa_load_spares(spa);
spa->spa_spares.sav_sync = B_TRUE;
}
} else {
error = SET_ERROR(EBUSY);
}

View File

@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
}
vdev_ops_t vdev_root_ops = {
.vdev_op_init = NULL,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_root_open,
.vdev_op_close = vdev_root_close,
.vdev_op_asize = vdev_default_asize,
.vdev_op_min_asize = vdev_default_min_asize,
.vdev_op_min_alloc = NULL,
.vdev_op_io_start = NULL, /* not applicable to the root */
.vdev_op_io_done = NULL, /* not applicable to the root */
.vdev_op_state_change = vdev_root_state_change,
@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
.vdev_op_rebuild_asize = NULL,
.vdev_op_metaslab_init = NULL,
.vdev_op_config_generate = NULL,
.vdev_op_nparity = NULL,
.vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};

View File

@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
vd->vdev_trim_secure = secure;
}
boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
vdev_trim_state_t old_state = vd->vdev_trim_state;
boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
vd->vdev_trim_state = new_state;
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
"vdev=%s suspended", vd->vdev_path);
break;
case VDEV_TRIM_CANCELED:
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
spa_history_log_internal(spa, "trim", tx,
"vdev=%s canceled", vd->vdev_path);
if (old_state == VDEV_TRIM_ACTIVE ||
old_state == VDEV_TRIM_SUSPENDED) {
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
spa_history_log_internal(spa, "trim", tx,
"vdev=%s canceled", vd->vdev_path);
}
break;
case VDEV_TRIM_COMPLETE:
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta)
return (0);
}
/*
 * Range translation callback which tracks the largest physical rs_end seen.
 * 'arg' points at a uint64_t that is raised to physical_rs->rs_end whenever
 * that value is larger than the one currently stored.
 */
static void
vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
{
	uint64_t *max_end = arg;
	uint64_t seg_end = physical_rs->rs_end;

	if (*max_end < seg_end)
		*max_end = seg_end;
}
/*
 * Range translation callback which accumulates TRIM progress for one
 * physical segment.  'arg' is the vdev whose counters are updated.
 *
 * The full segment length is always added to vdev_trim_bytes_est.  The
 * portion of the segment lying below vdev_trim_last_offset (all of it,
 * part of it, or none) is added to vdev_trim_bytes_done.
 */
static void
vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
{
	vdev_t *vd = arg;
	uint64_t seg_start = physical_rs->rs_start;
	uint64_t seg_end = physical_rs->rs_end;
	uint64_t seg_len = seg_end - seg_start;

	vd->vdev_trim_bytes_est += seg_len;

	if (vd->vdev_trim_last_offset >= seg_end) {
		/* Segment lies entirely below the last trimmed offset. */
		vd->vdev_trim_bytes_done += seg_len;
	} else if (vd->vdev_trim_last_offset > seg_start) {
		/* The last trimmed offset falls inside this segment. */
		vd->vdev_trim_bytes_done +=
		    vd->vdev_trim_last_offset - seg_start;
	}
}
/*
* Calculates the completion percentage of a manual TRIM.
*/
@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd)
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;
uint64_t ms_free = (msp->ms_size -
metaslab_allocated_space(msp)) /
vdev_get_ndisks(vd->vdev_top);
/*
* Convert the metaslab range to a physical range
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
vdev_xlate(vd, &logical_rs, &physical_rs);
/* Metaslab space after this offset has not been trimmed. */
vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
vd->vdev_trim_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
continue;
} else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
}
/* Metaslab space before this offset has been trimmed */
uint64_t last_rs_end = physical_rs.rs_end;
if (!vdev_xlate_is_empty(&remain_rs)) {
vdev_xlate_walk(vd, &remain_rs,
vdev_trim_xlate_last_rs_end, &last_rs_end);
}
if (vd->vdev_trim_last_offset > last_rs_end) {
vd->vdev_trim_bytes_done += ms_free;
vd->vdev_trim_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd)
rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
logical_rs.rs_start = rs_get_start(rs, rt);
logical_rs.rs_end = rs_get_end(rs, rt);
vdev_xlate(vd, &logical_rs, &physical_rs);
uint64_t size = physical_rs.rs_end -
physical_rs.rs_start;
vd->vdev_trim_bytes_est += size;
if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
vd->vdev_trim_bytes_done += size;
} else if (vd->vdev_trim_last_offset >
physical_rs.rs_start &&
vd->vdev_trim_last_offset <=
physical_rs.rs_end) {
vd->vdev_trim_bytes_done +=
vd->vdev_trim_last_offset -
physical_rs.rs_start;
}
vdev_xlate_walk(vd, &logical_rs,
vdev_trim_xlate_progress, vd);
}
mutex_exit(&msp->ms_lock);
}
@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd)
return (err);
}
/*
 * Range translation callback which adds one physical segment to the
 * range tree carried in the trim_args_t passed as 'arg'.
 *
 * Only a manual trim traverses the vdev sequentially, so only then is the
 * segment filtered against vdev_trim_last_offset: segments ending at or
 * below it are dropped, and a segment straddling it has its start moved up
 * to it (physical_rs is modified in place).  For any other trim type the
 * segment is added unchanged.
 */
static void
vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
{
	trim_args_t *ta = arg;
	vdev_t *vd = ta->trim_vdev;
	boolean_t manual = (ta->trim_type == TRIM_TYPE_MANUAL);

	/* Skip segments a manual trim has already visited. */
	if (manual && physical_rs->rs_end <= vd->vdev_trim_last_offset)
		return;

	/* Resume a manual trim part-way through this segment. */
	if (manual && vd->vdev_trim_last_offset > physical_rs->rs_start) {
		ASSERT3U(physical_rs->rs_end, >,
		    vd->vdev_trim_last_offset);
		physical_rs->rs_start = vd->vdev_trim_last_offset;
	}

	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);

	range_tree_add(ta->trim_tree, physical_rs->rs_start,
	    physical_rs->rs_end - physical_rs->rs_start);
}
/*
* Convert the logical range into a physical range and add it to the
* Convert the logical range into physical ranges and add them to the
* range tree passed in the trim_args_t.
*/
static void
@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
trim_args_t *ta = arg;
vdev_t *vd = ta->trim_vdev;
range_seg64_t logical_rs, physical_rs;
range_seg64_t logical_rs;
logical_rs.rs_start = start;
logical_rs.rs_end = start + size;
@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
}
ASSERT(vd->vdev_ops->vdev_op_leaf);
vdev_xlate(vd, &logical_rs, &physical_rs);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_start == physical_rs.rs_start);
IMPLY(vd->vdev_top == vd,
logical_rs.rs_end == physical_rs.rs_end);
/*
* Only a manual trim will be traversing the vdev sequentially.
* For an auto trim all valid ranges should be added.
*/
if (ta->trim_type == TRIM_TYPE_MANUAL) {
/* Only add segments that we have not visited yet */
if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
return;
/* Pick up where we left off mid-range. */
if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
ASSERT3U(physical_rs.rs_end, >,
vd->vdev_trim_last_offset);
physical_rs.rs_start = vd->vdev_trim_last_offset;
}
}
ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
/*
* With raidz, it's possible that the logical range does not live on
* this leaf vdev. We only add the physical range to this vdev's if it
* has a length greater than 0.
*/
if (physical_rs.rs_end > physical_rs.rs_start) {
range_tree_add(ta->trim_tree, physical_rs.rs_start,
physical_rs.rs_end - physical_rs.rs_start);
} else {
ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
}
vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
}
/*

View File

@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
bcopy(info, report->zcr_ckinfo, sizeof (*info));
}
report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
report->zcr_align =
vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
report->zcr_length = length;
#ifdef _KERNEL

View File

@ -1702,16 +1702,16 @@ zio_write_compress(zio_t *zio)
return (zio);
} else {
/*
* Round up compressed size up to the ashift
* of the smallest-ashift device, and zero the tail.
* This ensures that the compressed size of the BP
* (and thus compressratio property) are correct,
* Round compressed size up to the minimum allocation
* size of the smallest-ashift device, and zero the
* tail. This ensures that the compressed size of the
* BP (and thus compressratio property) are correct,
* in that we charge for the padding used to fill out
* the last sector.
*/
ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
size_t rounded = (size_t)P2ROUNDUP(psize,
1ULL << spa->spa_min_ashift);
/* spa_min_alloc is a size in bytes; compare it to a size, not a shift. */
ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSIZE);
size_t rounded = (size_t)roundup(psize,
spa->spa_min_alloc);
if (rounded >= lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
@ -3754,19 +3754,37 @@ zio_vdev_io_start(zio_t *zio)
* However, indirect vdevs point off to other vdevs which may have
* DTL's, so we never bypass them. The child i/os on concrete vdevs
* will be properly bypassed instead.
*
* Leaf DTL_PARTIAL can be empty when a legitimate write comes from
* a dRAID spare vdev. For example, when a dRAID spare is first
* used, its spare blocks need to be written to but the leaf vdevs
* of such blocks can have an empty DTL_PARTIAL.
*
* There seemed no clean way to allow such writes while bypassing
* spurious ones. At this point, just avoid all bypassing for dRAID
* for correctness.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
vd->vdev_ops != &vdev_indirect_ops &&
vd->vdev_top->vdev_ops != &vdev_draid_ops &&
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);
return (zio);
}
if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
/*
* Select the next best leaf I/O to process. Distributed spares are
* excluded since they dispatch the I/O directly to a leaf vdev after
* applying the dRAID mapping.
*/
if (vd->vdev_ops->vdev_op_leaf &&
vd->vdev_ops != &vdev_draid_spare_ops &&
(zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_TRIM)) {
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
return (zio);
@ -3803,8 +3821,8 @@ zio_vdev_io_done(zio_t *zio)
if (zio->io_delay)
zio->io_delay = gethrtime() - zio->io_delay;
if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
vd->vdev_ops != &vdev_draid_spare_ops) {
vdev_queue_io_done(zio);
if (zio->io_type == ZIO_TYPE_WRITE)
@ -4206,7 +4224,7 @@ zio_checksum_verify(zio_t *zio)
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
return (zio);
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
}
if ((error = zio_checksum_error(zio, &info)) != 0) {

View File

@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_type != ZIO_TYPE_READ)
return (0);
/*
* A rebuild I/O has no checksum to verify.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
return (0);
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;

View File

@ -36,6 +36,7 @@ export ZPOOL_SCRIPT_DIR=$$CMD_DIR/zpool/zpool.d
export ZPOOL_SCRIPTS_PATH=$$CMD_DIR/zpool/zpool.d
export CONTRIB_DIR=@abs_top_builddir@/contrib
export LIB_DIR=@abs_top_builddir@/lib
export SYSCONF_DIR=@abs_top_builddir@/etc
export INSTALL_UDEV_DIR=@udevdir@
export INSTALL_UDEV_RULE_DIR=@udevruledir@

View File

@ -166,6 +166,8 @@ if [ "${INSTALL}" = "yes" ]; then
"$INSTALL_UDEV_RULE_DIR/90-zfs.rules"
install "$CMD_DIR/zpool/zpool.d" \
"$INSTALL_SYSCONF_DIR/zfs/zpool.d"
install "$SYSCONF_DIR/zfs/draid.d" \
"$INSTALL_SYSCONF_DIR/zfs/draid.d"
install "$CONTRIB_DIR/pyzfs/libzfs_core" \
"$INSTALL_PYTHON_DIR/libzfs_core"
# Ideally we would install these in the configured ${libdir}, which is
@ -185,6 +187,7 @@ else
remove "$INSTALL_UDEV_RULE_DIR/69-vdev.rules"
remove "$INSTALL_UDEV_RULE_DIR/90-zfs.rules"
remove "$INSTALL_SYSCONF_DIR/zfs/zpool.d"
remove "$INSTALL_SYSCONF_DIR/zfs/draid.d"
remove "$INSTALL_PYTHON_DIR/libzfs_core"
remove "/lib/libzfs_core.so"
remove "/lib/libnvpair.so"

View File

@ -18,6 +18,7 @@
#
# Copyright (c) 2015 by Delphix. All rights reserved.
# Copyright (C) 2016 Lawrence Livermore National Security, LLC.
# Copyright (c) 2017, Intel Corporation.
#
BASE_DIR=$(dirname "$0")
@ -246,27 +247,60 @@ while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do
or_die rm -rf "$workdir"
or_die mkdir "$workdir"
# switch between common arrangements & fully randomized
if [[ $((RANDOM % 2)) -eq 0 ]]; then
mirrors=2
raidz=0
parity=1
vdevs=2
else
mirrors=$(((RANDOM % 3) * 1))
parity=$(((RANDOM % 3) + 1))
raidz=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2)))
vdevs=$(((RANDOM % 3) + 3))
fi
# switch between three types of configs
# 1/3 basic, 1/3 raidz mix, and 1/3 draid mix
choice=$((RANDOM % 3))
# ashift of 9 or 12
align=$(((RANDOM % 2) * 3 + 9))
runtime=$((RANDOM % 100))
# randomly use special classes
class="special=random"
if [[ $choice -eq 0 ]]; then
# basic mirror only
parity=1
mirrors=2
draid_data=0
draid_spares=0
raid_children=0
vdevs=2
raid_type="raidz"
elif [[ $choice -eq 1 ]]; then
# fully randomized mirror/raidz (sans dRAID)
parity=$(((RANDOM % 3) + 1))
mirrors=$(((RANDOM % 3) * 1))
draid_data=0
draid_spares=0
raid_children=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2)))
vdevs=$(((RANDOM % 3) + 3))
raid_type="raidz"
else
# fully randomized dRAID (sans mirror/raidz)
parity=$(((RANDOM % 3) + 1))
mirrors=0
draid_data=$(((RANDOM % 8) + 3))
draid_spares=$(((RANDOM % 2) + parity))
stripe=$((draid_data + parity))
extra=$((draid_spares + (RANDOM % 4)))
raid_children=$(((((RANDOM % 4) + 1) * stripe) + extra))
vdevs=$((RANDOM % 3))
raid_type="draid"
fi
# run from 30 to 119 seconds
runtime=$(((RANDOM % 90) + 30))
passtime=$((RANDOM % (runtime / 3 + 1) + 10))
zopt="$zopt -K $raid_type"
zopt="$zopt -m $mirrors"
zopt="$zopt -r $raidz"
zopt="$zopt -r $raid_children"
zopt="$zopt -D $draid_data"
zopt="$zopt -S $draid_spares"
zopt="$zopt -R $parity"
zopt="$zopt -v $vdevs"
zopt="$zopt -a $align"
zopt="$zopt -C $class"
zopt="$zopt -T $runtime"
zopt="$zopt -P $passtime"
zopt="$zopt -s $size"

View File

@ -333,6 +333,8 @@ tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos',
'zpool_create_023_neg', 'zpool_create_024_pos',
'zpool_create_encrypted', 'zpool_create_crypt_combos',
'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos',
'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos',
'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
'zpool_create_features_005_pos',
@ -375,7 +377,7 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos',
'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg',
'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos',
'zpool_import_015_pos',
'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos',
'zpool_import_features_001_pos', 'zpool_import_features_002_neg',
'zpool_import_features_003_pos', 'zpool_import_missing_001_pos',
'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos',
@ -710,12 +712,14 @@ tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted',
tags = ['functional', 'redacted_send']
[tests/functional/raidz]
tests = ['raidz_001_neg', 'raidz_002_pos']
tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos']
tags = ['functional', 'raidz']
[tests/functional/redundancy]
tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos',
'redundancy_004_neg']
tests = ['redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3',
'redundancy_draid_spare1', 'redundancy_draid_spare2',
'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz1',
'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe']
tags = ['functional', 'redundancy']
[tests/functional/refquota]

View File

@ -218,6 +218,7 @@ maybe = {
'no_space/enospc_002_pos': ['FAIL', enospc_reason],
'projectquota/setup': ['SKIP', exec_reason],
'redundancy/redundancy_004_neg': ['FAIL', '7290'],
'redundancy/redundancy_draid_spare3': ['SKIP', known_reason],
'reservation/reservation_008_pos': ['FAIL', '7741'],
'reservation/reservation_018_pos': ['FAIL', '5642'],
'rsend/rsend_019_pos': ['FAIL', '6086'],

View File

@ -6,6 +6,7 @@ SUBDIRS = \
chg_usr_exec \
devname2devid \
dir_rd_update \
draid \
file_check \
file_trunc \
file_write \

1
tests/zfs-tests/cmd/draid/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/draid

View File

@ -0,0 +1,15 @@
include $(top_srcdir)/config/Rules.am
pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
AM_CFLAGS += $(ZLIB_CFLAGS)
pkgexec_PROGRAMS = draid
draid_SOURCES = draid.c
draid_LDADD = \
$(abs_top_builddir)/lib/libzpool/libzpool.la \
$(abs_top_builddir)/lib/libnvpair/libnvpair.la
draid_LDADD += $(ZLIB_LIBS)

File diff suppressed because it is too large Load Diff

View File

@ -197,6 +197,7 @@ export ZFSTEST_FILES='badsend
chg_usr_exec
devname2devid
dir_rd_update
draid
file_check
file_trunc
file_write

View File

@ -2336,7 +2336,7 @@ function check_pool_status # pool token keyword <verbose>
function is_pool_resilvering #pool <verbose>
{
check_pool_status "$1" "scan" \
"resilver[ ()0-9A-Za-z_-]* in progress since" $2
"resilver[ ()0-9A-Za-z:_-]* in progress since" $2
return $?
}

View File

@ -60,6 +60,7 @@ MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_inter
MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval
OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize
PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment
RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms

View File

@ -66,7 +66,8 @@ function setup_filesystem #disklist #pool #fs #mntpoint #type #vdev
if [[ $vdev != "" && \
$vdev != "mirror" && \
$vdev != "raidz" ]] ; then
$vdev != "raidz" && \
$vdev != "draid" ]] ; then
log_note "Wrong vdev: (\"$vdev\")"
return 1

View File

@ -55,23 +55,26 @@ log_assert "'zpool add <pool> <vdev> ...' can add devices to the pool."
log_onexit cleanup
set -A keywords "" "mirror" "raidz" "raidz1" "spare"
set -A keywords "" "mirror" "raidz" "raidz1" "draid:1s" "draid1:1s" "spare"
pooldevs="${DISK0} \
\"${DISK0} ${DISK1}\" \
\"${DISK0} ${DISK1} ${DISK2}\""
mirrordevs="\"${DISK0} ${DISK1}\""
raidzdevs="\"${DISK0} ${DISK1}\""
draiddevs="\"${DISK0} ${DISK1} ${DISK2}\""
disk0=$TEST_BASE_DIR/disk0
disk1=$TEST_BASE_DIR/disk1
truncate -s $MINVDEVSIZE $disk0 $disk1
disk2=$TEST_BASE_DIR/disk2
truncate -s $MINVDEVSIZE $disk0 $disk1 $disk2
typeset -i i=0
typeset vdev
eval set -A poolarray $pooldevs
eval set -A mirrorarray $mirrordevs
eval set -A raidzarray $raidzdevs
eval set -A draidarray $draiddevs
while (( $i < ${#keywords[*]} )); do
@ -107,6 +110,19 @@ while (( $i < ${#keywords[*]} )); do
destroy_pool "$TESTPOOL"
done
;;
draid:1s|draid1:1s)
for vdev in "${draidarray[@]}"; do
create_pool "$TESTPOOL" "${keywords[i]}" \
"$disk0" "$disk1" "$disk2"
log_must poolexists "$TESTPOOL"
log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev
log_must vdevs_in_pool "$TESTPOOL" "$vdev"
log_must vdevs_in_pool "$TESTPOOL" "draid1-0-0"
log_must vdevs_in_pool "$TESTPOOL" "draid1-1-0"
destroy_pool "$TESTPOOL"
done
;;
esac

View File

@ -27,6 +27,10 @@ dist_pkgdata_SCRIPTS = \
zpool_create_024_pos.ksh \
zpool_create_encrypted.ksh \
zpool_create_crypt_combos.ksh \
zpool_create_draid_001_pos.ksh \
zpool_create_draid_002_pos.ksh \
zpool_create_draid_003_pos.ksh \
zpool_create_draid_004_pos.ksh \
zpool_create_features_001_pos.ksh \
zpool_create_features_002_pos.ksh \
zpool_create_features_003_pos.ksh \
@ -36,5 +40,6 @@ dist_pkgdata_SCRIPTS = \
zpool_create_tempname.ksh
dist_pkgdata_DATA = \
draidcfg.gz \
zpool_create.cfg \
zpool_create.shlib

View File

@ -64,14 +64,16 @@ pooldevs="${DISK0} \
\"${DISK0} ${DISK1}\" \
\"${DISK0} ${DISK1} ${DISK2}\" \
\"$disk1 $disk2\""
raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\""
mirrordevs="\"${DISK0} ${DISK1}\" \
$raidzdevs \
\"$disk1 $disk2\""
raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\""
draiddevs="\"${DISK0} ${DISK1} ${DISK2}\""
create_pool_test "$TESTPOOL" "" "$pooldevs"
create_pool_test "$TESTPOOL" "mirror" "$mirrordevs"
create_pool_test "$TESTPOOL" "raidz" "$raidzdevs"
create_pool_test "$TESTPOOL" "raidz1" "$raidzdevs"
create_pool_test "$TESTPOOL" "draid" "$draiddevs"
log_pass "'zpool create <pool> <vspec> ...' success."

View File

@ -54,7 +54,7 @@ log_assert "'zpool create [-R root][-m mountpoint] <pool> <vdev> ...' can create
"an alternate pool or a new pool mounted at the specified mountpoint."
log_onexit cleanup
set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2"
set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "draid" "draid2"
#
# cleanup the pools created in previous case if zpool_create_004_pos timedout
@ -67,8 +67,8 @@ done
rm -rf $TESTDIR
log_must mkdir -p $TESTDIR
typeset -i i=1
while (( i < 4 )); do
log_must mkfile $FILESIZE $TESTDIR/file.$i
while (( i < 5 )); do
log_must truncate -s $FILESIZE $TESTDIR/file.$i
(( i = i + 1 ))
done
@ -87,7 +87,7 @@ do
log_must zpool destroy -f $TESTPOOL
[[ -d $TESTDIR1 ]] && rm -rf $TESTDIR1
log_must zpool create $opt $TESTPOOL ${pooltype[i]} \
$file.1 $file.2 $file.3
$file.1 $file.2 $file.3 $file.4
! poolexists $TESTPOOL && \
log_fail "Creating pool with $opt fails."
mpt=`zfs mount | egrep "^$TESTPOOL[^/]" | awk '{print $2}'`

View File

@ -97,6 +97,20 @@ set -A valid_args \
"raidz2 $vdev0 $vdev1 $vdev2 spare $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \
"raidz3 $vdev0 $vdev1 $vdev2 $vdev3 \
mirror $vdev4 $vdev5 $vdev6 $vdev7" \
"draid $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4" \
"draid $vdev0 $vdev1 $vdev2 raidz1 $vdev3 $vdev4 $vdev5" \
"draid $vdev0 $vdev1 $vdev2 draid1 $vdev3 $vdev4 $vdev5" \
"draid $vdev0 $vdev1 $vdev2 special mirror $vdev3 $vdev4" \
"draid2 $vdev0 $vdev1 $vdev2 $vdev3 mirror $vdev4 $vdev5 $vdev6" \
"draid2 $vdev0 $vdev1 $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \
"draid2 $vdev0 $vdev1 $vdev2 $vdev3 draid2 $vdev4 $vdev5 $vdev6 $vdev7"\
"draid2 $vdev0 $vdev1 $vdev2 $vdev3 \
special mirror $vdev4 $vdev5 $vdev6" \
"draid2 $vdev0 $vdev1 $vdev2 $vdev3 \
special mirror $vdev4 $vdev5 $vdev6 \
cache $vdev7 log mirror $vdev8 $vdev9" \
"draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 $vdev7 \
special mirror $vdev8 $vdev9" \
"spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 raidz $vdev5 $vdev6"
set -A forced_args \
@ -109,11 +123,19 @@ set -A forced_args \
"raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4" \
"raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4 spare $vdev5" \
"raidz $vdev0 $vdev1 spare $vdev2 raidz2 $vdev3 $vdev4 $vdev5" \
"raidz $vdev0 $vdev1 draid2 $vdev2 $vdev3 $vdev4 $vdev5" \
"raidz $vdev0 $vdev1 draid3 $vdev2 $vdev3 $vdev4 $vdev5 $vdev6" \
"mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \
"mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 \
raidz2 $vdev4 $vdev5 $vdev6 spare $vdev7" \
"mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 \
spare $vdev4 raidz2 $vdev5 $vdev6 $vdev7" \
"mirror $vdev0 $vdev1 draid $vdev2 $vdev3 $vdev4 \
draid2 $vdev5 $vdev6 $vdev7 $vdev8 spare $vdev9" \
"draid $vdev0 $vdev1 $vdev2 $vdev3 \
draid2 $vdev4 $vdev5 $vdev6 $vdev7 $vdev8" \
"draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 \
special mirror $vdev7 $vdev8 $vdev9" \
"spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 \
raidz2 $vdev5 $vdev6 $vdev7"

View File

@ -54,13 +54,16 @@ set -A args "" "-?" "-n" "-f" "-nf" "-fn" "-f -n" "--f" "-e" "-s" \
"$TESTPOOL c0txd0" "$TESTPOOL c0t0dx" "$TESTPOOL cxtxdx" \
"$TESTPOOL mirror" "$TESTPOOL raidz" "$TESTPOOL mirror raidz" \
"$TESTPOOL raidz1" "$TESTPOOL mirror raidz1" \
"$TESTPOOL draid1" "$TESTPOOL mirror draid1" \
"$TESTPOOL mirror c?t?d?" "$TESTPOOL mirror $DISK0 c0t1d?" \
"$TESTPOOL RAIDZ $DISK0 $DISK1" \
"$TESTPOOL $DISK0 log $DISK1 log $DISK2" \
"$TESTPOOL $DISK0 spare $DISK1 spare $DISK2" \
"$TESTPOOL RAIDZ1 $DISK0 $DISK1" \
"$TESTPOOL MIRROR $DISK0" "$TESTPOOL raidz $DISK0" \
"$TESTPOOL raidz1 $DISK0" \
"$TESTPOOL RAIDZ1 $DISK0 $DISK1" "$TESTPOOL MIRROR $DISK0" \
"$TESTPOOL DRAID $DISK1 $DISK2 $DISK3" "$TESTPOOL raidz $DISK0" \
"$TESTPOOL raidz1 $DISK0" "$TESTPOOL draid $DISK0" \
"$TESTPOOL draid2 $DISK0 $DISK1" \
"$TESTPOOL draid $DISK0 $DISK1 $DISK2 spare s0-draid1-0" \
"1tank $DISK0" "1234 $DISK0" "?tank $DISK0" \
"tan%k $DISK0" "ta@# $DISK0" "tan+k $DISK0" \
"$BYND_MAX_NAME $DISK0"

View File

@ -63,7 +63,7 @@ log_onexit cleanup
unset NOINUSE_CHECK
typeset opt
for opt in "" "mirror" "raidz" "raidz1"; do
for opt in "" "mirror" "raidz" "draid"; do
if [[ $opt == "" ]]; then
typeset disks=$DISK0
else

View File

@ -63,15 +63,16 @@ log_must zfs create $TESTPOOL/$TESTFS
log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS
typeset -l devsize=$(($SPA_MINDEVSIZE - 1024 * 1024))
for files in $TESTDIR/file1 $TESTDIR/file2
for files in $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3
do
log_must mkfile $devsize $files
log_must truncate -s $devsize $files
done
set -A args \
"$TOOSMALL $TESTDIR/file1" "$TESTPOOL1 $TESTDIR/file1 $TESTDIR/file2" \
"$TOOSMALL mirror $TESTDIR/file1 $TESTDIR/file2" \
"$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2"
"$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" \
"$TOOSMALL draid $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3"
typeset -i i=0
while [[ $i -lt ${#args[*]} ]]; do

View File

@ -54,7 +54,7 @@ function cleanup
destroy_pool $pool
done
rm -rf $disk1 $disk2 $disk3
rm -rf $disk1 $disk2 $disk3 $disk4
if [[ -n $saved_dump_dev ]]; then
log_must dumpadm -u -d $saved_dump_dev
@ -66,12 +66,16 @@ log_onexit cleanup
disk1=$(create_blockfile $FILESIZE)
disk2=$(create_blockfile $FILESIZE)
disk3=$(create_blockfile $FILESIZE1)
disk3=$(create_blockfile $FILESIZE)
disk4=$(create_blockfile $FILESIZE1)
mirror1="$DISK0 $DISK1"
mirror2="$disk1 $disk2"
raidz1=$mirror1
raidz2=$mirror2
diff_size_dev="$disk2 $disk3"
draid1="$DISK0 $DISK1 $DISK2"
draid2="$disk1 $disk2 $disk3"
diff_size_dev="$disk2 $disk4"
draid_diff_size_dev="$disk1 $disk2 $disk4"
vfstab_dev=$(find_vfstab_dev)
if is_illumos; then
@ -91,13 +95,17 @@ set -A arg \
"$TESTPOOL1 mirror mirror $mirror1 mirror $mirror2" \
"$TESTPOOL1 raidz raidz $raidz1 raidz $raidz2" \
"$TESTPOOL1 raidz1 raidz1 $raidz1 raidz1 $raidz2" \
"$TESTPOOL1 draid draid $draid1 draid $draid2" \
"$TESTPOOL1 mirror raidz $raidz1 raidz $raidz2" \
"$TESTPOOL1 mirror raidz1 $raidz1 raidz1 $raidz2" \
"$TESTPOOL1 mirror draid $draid1 draid $draid2" \
"$TESTPOOL1 raidz mirror $mirror1 mirror $mirror2" \
"$TESTPOOL1 raidz1 mirror $mirror1 mirror $mirror2" \
"$TESTPOOL1 draid1 mirror $mirror1 mirror $mirror2" \
"$TESTPOOL1 mirror $diff_size_dev" \
"$TESTPOOL1 raidz $diff_size_dev" \
"$TESTPOOL1 raidz1 $diff_size_dev" \
"$TESTPOOL1 draid1 $draid_diff_size_dev" \
"$TESTPOOL1 mirror $mirror1 spare $mirror2 spare $diff_size_dev" \
"$TESTPOOL1 $vfstab_dev" \
"$TESTPOOL1 ${DISK0}s10" \

View File

@ -0,0 +1,75 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2020 Lawrence Livermore National Security, LLC.
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Create a variety of dRAID pools using the minimal dRAID vdev syntax.
#
# STRATEGY:
# 1) Create the required number of allowed dRAID vdevs.
# 2) Create a few pools of various sizes using the draid1|draid2|draid3 syntax.
#
verify_runnable "global"

#
# Exit handler: destroy $TESTPOOL if it still exists, then remove the
# file vdevs listed in $all_vdevs and the test directory.
#
function cleanup
{
	if poolexists $TESTPOOL; then
		destroy_pool $TESTPOOL
	fi

	rm -f $all_vdevs
	rmdir $TESTDIR
}
log_assert "'zpool create <pool> <draid1|2|3> ...' can create a pool."

log_onexit cleanup

# Backing files for the largest configuration exercised below (84 vdevs).
all_vdevs=$(echo $TESTDIR/file.{01..84})

mkdir $TESTDIR
log_must truncate -s $MINVDEVSIZE $all_vdevs

# For every parity level, create and destroy a pool with each child
# count from (parity + 2) up to 24.
for parity in 1 2 3; do
	for (( children = parity + 2; children <= 24; children++ )); do
		vdevs=$(echo $TESTDIR/file.{01..${children}})
		log_must zpool create $TESTPOOL draid$parity $vdevs
		log_must poolexists $TESTPOOL
		destroy_pool $TESTPOOL
	done
done

# Spot check a few large configurations using the bare "draid" type.
for children in 53 84; do
	vdevs=$(echo $TESTDIR/file.{01..${children}})
	log_must zpool create $TESTPOOL draid $vdevs
	log_must poolexists $TESTPOOL
	destroy_pool $TESTPOOL
done

log_pass "'zpool create <pool> <draid1|2|3> <vdevs> ...' success."

View File

@ -0,0 +1,82 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2020 Lawrence Livermore National Security, LLC.
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Create dRAID pool using the maximum number of vdevs (255). Then verify
# that creating a pool with 256 fails as expected.
#
# STRATEGY:
# 1) Verify a pool with fewer than the required vdevs fails.
# 2) Verify pools with a valid number of vdevs succeed.
# 3) Verify a pool which exceeds the maximum number of vdevs fails.
#
verify_runnable "global"

#
# Exit handler: destroy $TESTPOOL if it still exists, then remove the
# file vdevs listed in $all_vdevs and the test directory.
#
function cleanup
{
	if poolexists $TESTPOOL; then
		destroy_pool $TESTPOOL
	fi

	rm -f $all_vdevs
	rmdir $TESTDIR
}
log_assert "'zpool create <pool> draid <vdevs>'"

log_onexit cleanup

# 256 backing files: one more than the 255 children this test expects
# to be the maximum accepted by a dRAID vdev.
all_vdevs=$(echo $TESTDIR/file.{01..256})

mkdir $TESTDIR
log_must truncate -s $MINVDEVSIZE $all_vdevs

# Below the minimum dRAID vdev count for the specified parity level:
# draid<N> given only N file vdevs must be rejected.
for parity in 1 2 3; do
	log_mustnot zpool create $TESTPOOL draid$parity \
	    $(echo $TESTDIR/file.{01..0$parity})
done

# Verify pool sizes from 2-10.  Sizes between 10 and 254 are skipped to
# speed up the test case; they are expected to be exercised by the random
# pool creation done in a companion dRAID create test (the original
# comment names zpool_create_draid_002_pos.ksh -- TODO confirm which
# script is meant).
for i in {2..10}; do
	log_must zpool create $TESTPOOL draid:${i}c \
	    $(echo $TESTDIR/file.{01..$i})
	log_must destroy_pool $TESTPOOL
done

# Verify pool sizes from 254-255.
for i in 254 255; do
	log_must zpool create $TESTPOOL draid:${i}c \
	    $(echo $TESTDIR/file.{01..$i})
	log_must destroy_pool $TESTPOOL
done

# Exceeds maximum dRAID vdev count (256).
log_mustnot zpool create $TESTPOOL draid $(echo $TESTDIR/file.{01..256})

log_pass "'zpool create <pool> draid <vdevs>'"

View File

@ -0,0 +1,112 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2020 Lawrence Livermore National Security, LLC.
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Verify allowed striped widths (data+parity) and hot spares may be
# configured at pool creation time.
#
# STRATEGY:
# 1) Test valid stripe/spare combinations given the number of children.
# 2) Test invalid stripe/spare/children combinations outside the allowed limits.
#
verify_runnable "global"

#
# Exit handler: destroy $TESTPOOL if it still exists, then remove the
# file vdevs listed in $draid_vdevs and the test directory.
#
function cleanup
{
	if poolexists $TESTPOOL; then
		destroy_pool $TESTPOOL
	fi

	rm -f $draid_vdevs
	rmdir $TESTDIR
}
log_assert "'zpool create <pool> draid:#d:#c:#s <vdevs>'"

log_onexit cleanup

mkdir $TESTDIR

# Generate 10 random valid configurations to test.  Each one picks a
# parity level, spare count and data width, then a child count that is
# at least their sum and at most 32.
for iteration in {1..10}; do
	parity=$(random_int_between 1 3)
	spares=$(random_int_between 0 3)
	data=$(random_int_between 1 16)

	min_children=$((data + parity + spares))
	children=$(random_int_between $min_children 32)

	draid="draid${parity}:${data}d:${children}c:${spares}s"

	# Create exactly as many file vdevs as this configuration needs.
	draid_vdevs=$(echo $TESTDIR/file.{01..$children})
	log_must truncate -s $MINVDEVSIZE $draid_vdevs

	log_must zpool create $TESTPOOL $draid $draid_vdevs
	log_must poolexists $TESTPOOL
	destroy_pool $TESTPOOL

	rm -f $draid_vdevs
done
# Create the 32 file vdevs used by all of the checks below.  $TESTDIR
# already exists (it was created before the random configurations were
# generated), so no second mkdir is needed and the files only have to
# be sized once.
children=32
draid_vdevs=$(echo $TESTDIR/file.{01..$children})
log_must truncate -s $MINVDEVSIZE $draid_vdevs
# Out of order and unknown suffixes should fail.
for bad_spec in draid:d8 draid:s3 draid:c32 draid:10x draid:x10; do
	log_mustnot zpool create $TESTPOOL $bad_spec $draid_vdevs
done

# Maximum data disks is limited by the total children: with the vdevs
# supplied here draid2:30d is accepted and draid2:31d is rejected.
log_must zpool create $TESTPOOL draid2:30d $draid_vdevs
log_must destroy_pool $TESTPOOL
log_mustnot zpool create $TESTPOOL draid2:31d $draid_vdevs

# At least one data disk must be requested.
log_mustnot zpool create $TESTPOOL draid2:0d $draid_vdevs

# Check invalid parity levels.
for bad_parity in draid0 draid4; do
	log_mustnot zpool create $TESTPOOL $bad_parity $draid_vdevs
done

# Spares are limited: spares <= children - (parity + data).
log_must zpool create $TESTPOOL draid2:20d:10s $draid_vdevs
log_must destroy_pool $TESTPOOL
log_mustnot zpool create $TESTPOOL draid2:20d:11s $draid_vdevs

# When a children count is given it is enforced: 0c and 31c are rejected,
# while 32c is accepted for the vdevs supplied here.
for bad_children in draid2:0c draid2:31c; do
	log_mustnot zpool create $TESTPOOL $bad_children $draid_vdevs
done
log_must zpool create $TESTPOOL draid2:32c $draid_vdevs
destroy_pool $TESTPOOL

log_pass "'zpool create <pool> draid:#d:#c:#s <vdevs>'"

View File

@ -0,0 +1,43 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2020 Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
#
# DESCRIPTION:
# Verify generated dRAID permutation maps against the authoritative
# reference file, which contains the full permutations.
#
verify_runnable "global"
log_assert "'draid verify'"
# Path of the compressed reference file (draidcfg.gz) under $STF_SUITE.
DRAIDCFG="$STF_SUITE/tests/functional/cli_root/zpool_create/draidcfg.gz"
# Run `draid verify` on the reference file; log_must is presumably what
# fails the test if the command exits non-zero (confirm in libtest.shlib).
log_must draid verify $DRAIDCFG
log_pass "'draid verify'"

View File

@ -72,7 +72,7 @@ log_onexit cleanup
log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion"
for type in " " mirror raidz raidz2; do
for type in " " mirror raidz draid; do
log_note "Setting up loopback, scsi_debug, and file vdevs"
log_must truncate -s $org_size $FILE_LO
DEV1=$(losetup -f)
@ -144,6 +144,16 @@ for type in " " mirror raidz raidz2; do
if [[ $? -ne 0 ]] ; then
log_fail "pool $TESTPOOL1 has not expanded"
fi
elif [[ $type == "draid" ]]; then
	# For draid the pool is expected to grow by twice the per-device
	# size increase; look for the matching "vdev online" record in
	# the history of $TESTPOOL1.
	typeset expansion_size=$((2*($exp_size-$org_size)))
	zpool history -il $TESTPOOL1 | \
	    grep "pool '$TESTPOOL1' size:" | \
	    grep "vdev online" | \
	    grep "(+${expansion_size})" >/dev/null 2>&1

	if [[ $? -ne 0 ]]; then
		# Report the pool that was actually checked ($TESTPOOL1).
		log_fail "pool $TESTPOOL1 has not expanded"
	fi
else
typeset expansion_size=$((3*($exp_size-$org_size)))
zpool history -il $TESTPOOL1 | \

View File

@ -63,7 +63,7 @@ log_onexit cleanup
log_assert "zpool can expand after zpool online -e zvol vdevs on vdev expansion"
for type in " " mirror raidz raidz2; do
for type in " " mirror raidz draid:1s; do
# Initialize the file devices and the pool
for i in 1 2 3; do
log_must truncate -s $org_size ${TEMPFILE}.$i
@ -92,6 +92,8 @@ for type in " " mirror raidz raidz2; do
if [[ $type == "mirror" ]]; then
typeset expected_zpool_expandsize=$(($exp_size-$org_size))
elif [[ $type == "draid:1s" ]]; then
typeset expected_zpool_expandsize=$((2*($exp_size-$org_size)))
else
typeset expected_zpool_expandsize=$((3*($exp_size-$org_size)))
fi
@ -147,6 +149,17 @@ for type in " " mirror raidz raidz2; do
log_fail "pool $TESTPOOL1 has not expanded " \
"after zpool online -e"
fi
elif [[ $type == "draid:1s" ]]; then
typeset expansion_size=$((2*($exp_size-$org_size)))
zpool history -il $TESTPOOL1 | \
grep "pool '$TESTPOOL1' size:" | \
grep "vdev online" | \
grep "(+${expansion_size})" >/dev/null 2>&1
if [[ $? -ne 0 ]] ; then
log_fail "pool $TESTPOOL1 has not expanded " \
"after zpool online -e"
fi
else
typeset expansion_size=$((3*($exp_size-$org_size)))
zpool history -il $TESTPOOL1 | \
@ -160,9 +173,17 @@ for type in " " mirror raidz raidz2; do
fi
fi
else
log_fail "pool $TESTPOOL1 did not expand after vdev expansion " \
"and zpool online -e"
log_fail "pool $TESTPOOL1 did not expand after vdev " \
"expansion and zpool online -e"
fi
# For dRAID pools verify the distributed spare was resized after
# expansion and it is large enough to be used to replace a pool vdev.
if [[ $type == "draid:1s" ]]; then
log_must zpool replace -w $TESTPOOL1 $TEMPFILE.3 draid1-0-0
verify_pool $TESTPOOL1
fi
log_must zpool destroy $TESTPOOL1
done
log_pass "zpool can expand after zpool online -e"

View File

@ -73,7 +73,7 @@ log_onexit cleanup
log_assert "zpool can not expand if set autoexpand=off after vdev expansion"
for type in " " mirror raidz raidz2; do
for type in " " mirror raidz draid; do
log_note "Setting up loopback, scsi_debug, and file vdevs"
log_must truncate -s $org_size $FILE_LO
DEV1=$(losetup -f)

View File

@ -61,7 +61,7 @@ log_onexit cleanup
log_assert "After vdev expansion, all 4 labels have the same set of uberblocks."
for type in " " mirror raidz raidz2; do
for type in " " mirror raidz draid; do
for i in 1 2 3; do
log_must truncate -s $org_size ${TEMPFILE}.$i
done

Some files were not shown because too many files have changed in this diff Show More