mirror of https://github.com/openzfs/zfs.git
Fix i/o error handling of livelists and zap iteration
Pool-wide metadata is stored in the MOS (Meta Object Set). This metadata is stored in triplicate, in addition to any pool-level reduncancy (e.g. RAIDZ). However, if all 3+ copies of this metadata are not available, we can still get EIO/ECKSUM when reading from the MOS. If we encounter such an error in syncing context, we have typically already committed to making a change that we now can't do because of the corrupt/missing metadata. We typically "handle" this with a `VERIFY()` or `zfs_panic_recover()`. This prevents the system from continuing on in an undefined state, while minimizing the amount of error-handling code. However, there are some code paths that ignore these i/o errors, or `ASSERT()` that they don't happen. Since assertions are disabled on non-debug builds, they effectively ignore them as well. This can lead to ZFS continuing on in an incorrect state, potentially leading to on-disk inconsistencies. This commit adds handling for these i/o errors on MOS metadata, typically with a `VERIFY()`: * Handle error return from `zap_cursor_retrieve()` in 4 places in `dsl_deadlist.c`. * Handle error return from `zap_contains()` in `dsl_dir_hold_obj()`. Turns out this call isn't necessary because we can always call `zap_lookup()`. * Handle error return from `zap_lookup()` in `dsl_fs_ss_limit_check()`. * Handle error return from `zap_remove()` in `dsl_dir_rename_sync()`. * Handle error return from `zap_lookup()` in `dsl_dir_remove_livelist()`. * Handle error return from `dsl_process_sub_livelist()` in `spa_livelist_delete_cb()`. Additionally: * Augment the internal history log message for `zfs destroy` to note which method is used (e.g. bptree, livelist, or, synchronous) and the mintxg. * Correct a comment in `dbuf_init()`. * Correct indentation in `dsl_dir_remove_livelist()`. Reviewed by: Sara Hartse <sara.hartse@delphix.com> Reviewed-by: George Wilson <george.wilson@delphix.com> Signed-off-by: Matthew Ahrens <mahrens@delphix.com> Closes #10643
This commit is contained in:
parent
1b376d176e
commit
d87676a9fa
|
@ -134,6 +134,7 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
|
|||
{
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
int error;
|
||||
|
||||
ASSERT(MUTEX_HELD(&dl->dl_lock));
|
||||
|
||||
|
@ -162,7 +163,7 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
|
|||
sizeof (dsl_deadlist_entry_t),
|
||||
offsetof(dsl_deadlist_entry_t, dle_node));
|
||||
for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
|
||||
zap_cursor_retrieve(&zc, &za) == 0;
|
||||
(error = zap_cursor_retrieve(&zc, &za)) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
|
||||
dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
|
||||
|
@ -177,6 +178,7 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
|
|||
|
||||
avl_add(&dl->dl_tree, dle);
|
||||
}
|
||||
VERIFY3U(error, ==, ENOENT);
|
||||
zap_cursor_fini(&zc);
|
||||
|
||||
for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
|
||||
|
@ -206,6 +208,7 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl)
|
|||
{
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
int error;
|
||||
|
||||
ASSERT(MUTEX_HELD(&dl->dl_lock));
|
||||
|
||||
|
@ -219,7 +222,7 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl)
|
|||
sizeof (dsl_deadlist_cache_entry_t),
|
||||
offsetof(dsl_deadlist_cache_entry_t, dlce_node));
|
||||
for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
|
||||
zap_cursor_retrieve(&zc, &za) == 0;
|
||||
(error = zap_cursor_retrieve(&zc, &za)) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
if (za.za_first_integer == empty_bpobj)
|
||||
continue;
|
||||
|
@ -236,6 +239,7 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl)
|
|||
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
|
||||
avl_add(&dl->dl_cache, dlce);
|
||||
}
|
||||
VERIFY3U(error, ==, ENOENT);
|
||||
zap_cursor_fini(&zc);
|
||||
|
||||
for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
|
||||
|
@ -378,6 +382,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
|
|||
dmu_object_info_t doi;
|
||||
zap_cursor_t zc;
|
||||
zap_attribute_t za;
|
||||
int error;
|
||||
|
||||
VERIFY0(dmu_object_info(os, dlobj, &doi));
|
||||
if (doi.doi_type == DMU_OT_BPOBJ) {
|
||||
|
@ -386,7 +391,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
|
|||
}
|
||||
|
||||
for (zap_cursor_init(&zc, os, dlobj);
|
||||
zap_cursor_retrieve(&zc, &za) == 0;
|
||||
(error = zap_cursor_retrieve(&zc, &za)) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
uint64_t obj = za.za_first_integer;
|
||||
if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
|
||||
|
@ -394,6 +399,7 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
|
|||
else
|
||||
bpobj_free(os, obj, tx);
|
||||
}
|
||||
VERIFY3U(error, ==, ENOENT);
|
||||
zap_cursor_fini(&zc);
|
||||
VERIFY0(dmu_object_free(os, dlobj, tx));
|
||||
}
|
||||
|
@ -824,6 +830,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
|
|||
dmu_buf_t *bonus;
|
||||
dsl_deadlist_phys_t *dlp;
|
||||
dmu_object_info_t doi;
|
||||
int error;
|
||||
|
||||
VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
|
||||
if (doi.doi_type == DMU_OT_BPOBJ) {
|
||||
|
@ -836,12 +843,13 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
|
|||
|
||||
mutex_enter(&dl->dl_lock);
|
||||
for (zap_cursor_init(&zc, dl->dl_os, obj);
|
||||
zap_cursor_retrieve(&zc, &za) == 0;
|
||||
(error = zap_cursor_retrieve(&zc, &za)) == 0;
|
||||
zap_cursor_advance(&zc)) {
|
||||
uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
|
||||
dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
|
||||
VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
|
||||
}
|
||||
VERIFY3U(error, ==, ENOENT);
|
||||
zap_cursor_fini(&zc);
|
||||
|
||||
VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
|
||||
|
|
|
@ -738,6 +738,10 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
|
|||
{
|
||||
struct killarg ka;
|
||||
|
||||
spa_history_log_internal_ds(ds, "destroy", tx,
|
||||
"(synchronous, mintxg=%llu)",
|
||||
(long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
|
||||
|
||||
/*
|
||||
* Free everything that we point to (that's born after
|
||||
* the previous snapshot, if we are a clone)
|
||||
|
@ -902,6 +906,14 @@ dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
|
|||
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
|
||||
VERIFY0(dmu_objset_from_ds(ds, &os));
|
||||
|
||||
uint64_t mintxg = 0;
|
||||
dsl_deadlist_entry_t *dle = dsl_deadlist_first(&dd->dd_livelist);
|
||||
if (dle != NULL)
|
||||
mintxg = dle->dle_mintxg;
|
||||
|
||||
spa_history_log_internal_ds(ds, "destroy", tx,
|
||||
"(livelist, mintxg=%llu)", (long long)mintxg);
|
||||
|
||||
/* Check that the clone is in a correct state to be deleted */
|
||||
dsl_clone_destroy_assert(dd);
|
||||
|
||||
|
@ -922,7 +934,7 @@ dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
|
|||
spa->spa_livelists_to_delete = zap_obj;
|
||||
} else if (error != 0) {
|
||||
zfs_panic_recover("zfs: error %d was returned while looking "
|
||||
"up DMU_POOL_DELETED_CLONES in the zap");
|
||||
"up DMU_POOL_DELETED_CLONES in the zap", error);
|
||||
return;
|
||||
}
|
||||
VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));
|
||||
|
@ -952,6 +964,10 @@ dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
|
|||
dsl_pool_t *dp = dmu_tx_pool(tx);
|
||||
objset_t *mos = dp->dp_meta_objset;
|
||||
|
||||
spa_history_log_internal_ds(ds, "destroy", tx,
|
||||
"(bptree, mintxg=%llu)",
|
||||
(long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
|
||||
|
||||
zil_destroy_sync(dmu_objset_zil(os), tx);
|
||||
|
||||
if (!spa_feature_is_active(dp->dp_spa,
|
||||
|
@ -1003,9 +1019,6 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
|
|||
rrw_exit(&ds->ds_bp_rwlock, FTAG);
|
||||
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
|
||||
|
||||
/* We need to log before removing it from the namespace. */
|
||||
spa_history_log_internal_ds(ds, "destroy", tx, " ");
|
||||
|
||||
dsl_dir_cancel_waiters(ds->ds_dir);
|
||||
|
||||
rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
|
||||
|
|
|
@ -197,25 +197,27 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
|
|||
dd->dd_dbuf = dbuf;
|
||||
dd->dd_pool = dp;
|
||||
|
||||
if (dsl_dir_is_zapified(dd) &&
|
||||
zap_contains(dp->dp_meta_objset, ddobj,
|
||||
DD_FIELD_CRYPTO_KEY_OBJ) == 0) {
|
||||
VERIFY0(zap_lookup(dp->dp_meta_objset,
|
||||
ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
|
||||
sizeof (uint64_t), 1, &dd->dd_crypto_obj));
|
||||
|
||||
/* check for on-disk format errata */
|
||||
if (dsl_dir_incompatible_encryption_version(dd)) {
|
||||
dp->dp_spa->spa_errata =
|
||||
ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
|
||||
}
|
||||
}
|
||||
|
||||
mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
|
||||
dsl_prop_init(dd);
|
||||
|
||||
if (dsl_dir_is_zapified(dd)) {
|
||||
err = zap_lookup(dp->dp_meta_objset,
|
||||
ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
|
||||
sizeof (uint64_t), 1, &dd->dd_crypto_obj);
|
||||
if (err == 0) {
|
||||
/* check for on-disk format errata */
|
||||
if (dsl_dir_incompatible_encryption_version(
|
||||
dd)) {
|
||||
dp->dp_spa->spa_errata =
|
||||
ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
|
||||
}
|
||||
} else if (err != ENOENT) {
|
||||
goto errout;
|
||||
}
|
||||
}
|
||||
|
||||
dsl_dir_snap_cmtime_update(dd);
|
||||
|
||||
if (dsl_dir_phys(dd)->dd_parent_obj) {
|
||||
|
@ -863,9 +865,14 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
|
|||
* stop since we know there is no limit here (or above). The counts are
|
||||
* not valid on this node and we know we won't touch this node's counts.
|
||||
*/
|
||||
if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
|
||||
count_prop, sizeof (count), 1, &count) == ENOENT)
|
||||
if (!dsl_dir_is_zapified(dd))
|
||||
return (0);
|
||||
err = zap_lookup(os, dd->dd_object,
|
||||
count_prop, sizeof (count), 1, &count);
|
||||
if (err == ENOENT)
|
||||
return (0);
|
||||
if (err != 0)
|
||||
return (err);
|
||||
|
||||
err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
|
||||
B_FALSE);
|
||||
|
@ -2047,7 +2054,6 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
|
|||
dsl_pool_t *dp = dmu_tx_pool(tx);
|
||||
dsl_dir_t *dd, *newparent;
|
||||
const char *mynewname;
|
||||
int error;
|
||||
objset_t *mos = dp->dp_meta_objset;
|
||||
|
||||
VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
|
||||
|
@ -2114,10 +2120,9 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
|
|||
dmu_buf_will_dirty(dd->dd_dbuf, tx);
|
||||
|
||||
/* remove from old parent zapobj */
|
||||
error = zap_remove(mos,
|
||||
VERIFY0(zap_remove(mos,
|
||||
dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
|
||||
dd->dd_myname, tx);
|
||||
ASSERT0(error);
|
||||
dd->dd_myname, tx));
|
||||
|
||||
(void) strlcpy(dd->dd_myname, mynewname,
|
||||
sizeof (dd->dd_myname));
|
||||
|
@ -2259,49 +2264,45 @@ dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
|
|||
zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
|
||||
if (ll_condense_thread != NULL &&
|
||||
(to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
|
||||
/*
|
||||
* We use zthr_wait_cycle_done instead of zthr_cancel
|
||||
* because we don't want to destroy the zthr, just have
|
||||
* it skip its current task.
|
||||
*/
|
||||
spa->spa_to_condense.cancelled = B_TRUE;
|
||||
zthr_wait_cycle_done(ll_condense_thread);
|
||||
/*
|
||||
* If we've returned from zthr_wait_cycle_done without
|
||||
* clearing the to_condense data structure it's either
|
||||
* because the no-wait synctask has started (which is
|
||||
* indicated by 'syncing' field of to_condense) and we
|
||||
* can expect it to clear to_condense on its own.
|
||||
* Otherwise, we returned before the zthr ran. The
|
||||
* checkfunc will now fail as cancelled == B_TRUE so we
|
||||
* can safely NULL out ds, allowing a different dir's
|
||||
* livelist to be condensed.
|
||||
*
|
||||
* We can be sure that the to_condense struct will not
|
||||
* be repopulated at this stage because both this
|
||||
* function and dsl_livelist_try_condense execute in
|
||||
* syncing context.
|
||||
*/
|
||||
if ((spa->spa_to_condense.ds != NULL) &&
|
||||
!spa->spa_to_condense.syncing) {
|
||||
dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
|
||||
spa);
|
||||
spa->spa_to_condense.ds = NULL;
|
||||
}
|
||||
/*
|
||||
* We use zthr_wait_cycle_done instead of zthr_cancel
|
||||
* because we don't want to destroy the zthr, just have
|
||||
* it skip its current task.
|
||||
*/
|
||||
spa->spa_to_condense.cancelled = B_TRUE;
|
||||
zthr_wait_cycle_done(ll_condense_thread);
|
||||
/*
|
||||
* If we've returned from zthr_wait_cycle_done without
|
||||
* clearing the to_condense data structure it's either
|
||||
* because the no-wait synctask has started (which is
|
||||
* indicated by 'syncing' field of to_condense) and we
|
||||
* can expect it to clear to_condense on its own.
|
||||
* Otherwise, we returned before the zthr ran. The
|
||||
* checkfunc will now fail as cancelled == B_TRUE so we
|
||||
* can safely NULL out ds, allowing a different dir's
|
||||
* livelist to be condensed.
|
||||
*
|
||||
* We can be sure that the to_condense struct will not
|
||||
* be repopulated at this stage because both this
|
||||
* function and dsl_livelist_try_condense execute in
|
||||
* syncing context.
|
||||
*/
|
||||
if ((spa->spa_to_condense.ds != NULL) &&
|
||||
!spa->spa_to_condense.syncing) {
|
||||
dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
|
||||
spa);
|
||||
spa->spa_to_condense.ds = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
dsl_dir_livelist_close(dd);
|
||||
int err = zap_lookup(dp->dp_meta_objset, dd->dd_object,
|
||||
DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj);
|
||||
if (err == 0) {
|
||||
VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
|
||||
DD_FIELD_LIVELIST, tx));
|
||||
if (total) {
|
||||
dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
|
||||
spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
|
||||
}
|
||||
} else {
|
||||
ASSERT3U(err, !=, ENOENT);
|
||||
VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
|
||||
DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
|
||||
VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
|
||||
DD_FIELD_LIVELIST, tx));
|
||||
if (total) {
|
||||
dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
|
||||
spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2527,7 +2527,7 @@ spa_livelist_delete_cb(void *arg, zthr_t *z)
|
|||
sublist_delete_sync, &sync_arg, 0,
|
||||
ZFS_SPACE_CHECK_DESTROY));
|
||||
} else {
|
||||
ASSERT(err == EINTR);
|
||||
VERIFY3U(err, ==, EINTR);
|
||||
}
|
||||
bplist_clear(&to_free);
|
||||
bplist_destroy(&to_free);
|
||||
|
|
Loading…
Reference in New Issue