From 5a42ef04fd390dc96fbbf31bc9f3d05695998211 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 1 Apr 2020 10:02:06 -0700 Subject: [PATCH] Add 'zfs wait' command Add a mechanism to wait for delete queue to drain. When doing redacted send/recv, many workflows involve deleting files that contain sensitive data. Because of the way zfs handles file deletions, snapshots taken quickly after a rm operation can sometimes still contain the file in question, especially if the file is very large. This can result in issues for redacted send/recv users who expect the deleted files to be redacted in the send streams, and not appear in their clones. This change duplicates much of the zpool wait related logic into a zfs wait command, which can be used to wait until the internal deleteq has been drained. Additional wait activities may be added in the future. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Reviewed-by: John Gallagher Signed-off-by: Paul Dagnelie Closes #9707 --- cmd/zfs/zfs_main.c | 91 +++++++++++++- configure.ac | 1 + include/libzfs.h | 3 + include/libzfs_core.h | 1 + include/sys/dsl_dir.h | 8 ++ include/sys/fs/zfs.h | 12 ++ lib/libzfs/libzfs_dataset.c | 28 +++++ lib/libzfs_core/libzfs_core.c | 20 ++++ man/man8/Makefile.am | 1 + man/man8/zfs-wait.8 | 71 +++++++++++ man/man8/zfs.8 | 5 + module/os/linux/zfs/zfs_dir.c | 11 ++ module/os/linux/zfs/zfs_vfsops.c | 6 + module/zfs/dsl_dataset.c | 28 +++-- module/zfs/dsl_destroy.c | 4 + module/zfs/dsl_dir.c | 113 ++++++++++++++++++ module/zfs/zfs_ioctl.c | 82 +++++++++++++ tests/runfiles/common.run | 4 + .../libzfs_input_check/libzfs_input_check.c | 14 +++ .../tests/functional/cli_root/Makefile.am | 1 + .../functional/cli_root/zfs_wait/Makefile.am | 8 ++ .../functional/cli_root/zfs_wait/cleanup.ksh | 20 ++++ .../functional/cli_root/zfs_wait/setup.ksh | 21 ++++ .../cli_root/zfs_wait/zfs_wait.kshlib | 80 +++++++++++++ .../cli_root/zfs_wait/zfs_wait_deleteq.ksh | 57 +++++++++ 25 files changed, 679 insertions(+), 
11 deletions(-) create mode 100644 man/man8/zfs-wait.8 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 5e5bbc9728..ae71cdc881 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -122,6 +122,7 @@ static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); static int zfs_do_version(int argc, char **argv); static int zfs_do_redact(int argc, char **argv); +static int zfs_do_wait(int argc, char **argv); #ifdef __FreeBSD__ static int zfs_do_jail(int argc, char **argv); @@ -183,7 +184,8 @@ typedef enum { HELP_VERSION, HELP_REDACT, HELP_JAIL, - HELP_UNJAIL + HELP_UNJAIL, + HELP_WAIT, } zfs_help_t; typedef struct zfs_command { @@ -248,6 +250,7 @@ static zfs_command_t command_table[] = { { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, { "redact", zfs_do_redact, HELP_REDACT }, + { "wait", zfs_do_wait, HELP_WAIT }, #ifdef __FreeBSD__ { "jail", zfs_do_jail, HELP_JAIL }, @@ -410,6 +413,8 @@ get_usage(zfs_help_t idx) return (gettext("\tjail \n")); case HELP_UNJAIL: return (gettext("\tunjail \n")); + case HELP_WAIT: + return (gettext("\twait [-t <activity>] <filesystem>\n")); } abort(); @@ -8317,6 +8322,90 @@ zfs_do_project(int argc, char **argv) return (ret); } +static int +zfs_do_wait(int argc, char **argv) +{ + boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; + int error = 0, i; + int c; /* getopt() returns int; a plain char may be unsigned */ + + /* By default, wait for all types of activity.
*/ + for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) + enabled[i] = B_TRUE; + + while ((c = getopt(argc, argv, "t:")) != -1) { + switch (c) { + case 't': + { + static char *col_subopts[] = { "deleteq", NULL }; + char *value; + + /* Reset activities array */ + bzero(&enabled, sizeof (enabled)); + while (*optarg != '\0') { + int activity = getsubopt(&optarg, col_subopts, + &value); + + if (activity < 0) { + (void) fprintf(stderr, + gettext("invalid activity '%s'\n"), + value); + usage(B_FALSE); + } + + enabled[activity] = B_TRUE; + } + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argv += optind; + argc -= optind; + if (argc < 1) { + (void) fprintf(stderr, gettext("missing 'filesystem' " + "argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + return (1); + + for (;;) { + boolean_t missing = B_FALSE; + boolean_t any_waited = B_FALSE; + + for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { + boolean_t waited; + + if (!enabled[i]) + continue; + + error = zfs_wait_status(zhp, i, &missing, &waited); + if (error != 0 || missing) + break; + + any_waited = (any_waited || waited); + } + + if (error != 0 || missing || !any_waited) + break; + } + + zfs_close(zhp); + + return (error); +} + /* * Display version message */ diff --git a/configure.ac b/configure.ac index eeb0a3843a..370a1970ff 100644 --- a/configure.ac +++ b/configure.ac @@ -264,6 +264,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile tests/zfs-tests/tests/functional/cli_root/zpool/Makefile tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_attach/Makefile diff --git a/include/libzfs.h b/include/libzfs.h index 236a73130c..7633579d43 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -507,6 +507,9 @@ extern nvlist_t *zfs_get_user_props(zfs_handle_t *); extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); +extern int zfs_wait_status(zfs_handle_t *, zfs_wait_activity_t, + boolean_t *, boolean_t *); + /* * zfs encryption management */ diff --git a/include/libzfs_core.h b/include/libzfs_core.h index c4b4f8e716..18ce6994a0 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -133,6 +133,7 @@ int lzc_pool_checkpoint_discard(const char *); int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); +int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); #ifdef __cplusplus } diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index bb69210271..88fd610354 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -121,6 +121,11 @@ struct dsl_dir { bplist_t dd_pending_frees; bplist_t dd_pending_allocs; + kmutex_t dd_activity_lock; + kcondvar_t dd_activity_cv; + boolean_t dd_activity_cancelled; + uint64_t dd_activity_waiters; + /* protected by dd_lock; keep at end of struct for better locality */ char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; }; @@ -192,6 +197,9 @@ boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); void dsl_dir_livelist_close(dsl_dir_t *dd); void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total); +int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, + boolean_t *waited); +void dsl_dir_cancel_waiters(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 3484b13e37..477356aa77 100644 --- 
a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1282,6 +1282,7 @@ typedef enum zfs_ioc { ZFS_IOC_REDACT, /* 0x5a51 */ ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ ZFS_IOC_WAIT, /* 0x5a53 */ + ZFS_IOC_WAIT_FS, /* 0x5a54 */ /* * Per-platform (Optional) - 6/128 numbers reserved. @@ -1358,6 +1359,11 @@ typedef enum { ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; +typedef enum { + ZFS_WAIT_DELETEQ, + ZFS_WAIT_NUM_ACTIVITIES +} zfs_wait_activity_t; + /* * Bookmark name values. */ @@ -1415,6 +1421,12 @@ typedef enum { #define ZPOOL_WAIT_TAG "wait_tag" #define ZPOOL_WAIT_WAITED "wait_waited" +/* + * The following are names used when invoking ZFS_IOC_WAIT_FS. + */ +#define ZFS_WAIT_ACTIVITY "wait_activity" +#define ZFS_WAIT_WAITED "wait_waited" + /* * Flags for ZFS_IOC_VDEV_SET_STATE */ diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 48d6563232..45e7a79fb4 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -5599,3 +5599,31 @@ zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, volsize += numdb; return (volsize); } + +/* + * Wait for the given activity and return the status of the wait (whether or not + * any waiting was done) in the 'waited' parameter. Non-existent fses are + * reported via the 'missing' parameter, rather than by printing an error + * message. This is convenient when this function is called in a loop over a + * long period of time (as it is, for example, by zfs's wait cmd). In that + * scenario, a fs being exported or destroyed should be considered a normal + * event, so we don't want to print an error when we find that the fs doesn't + * exist. 
+ */ +int +zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, + boolean_t *missing, boolean_t *waited) +{ + int error = lzc_wait_fs(zhp->zfs_name, activity, waited); + *missing = (error == ENOENT); + if (*missing) + return (0); + + if (error != 0) { + (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, + dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), + zhp->zfs_name); + } + + return (error); +} diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index f65db4ff46..18143d3640 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1621,3 +1621,23 @@ lzc_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, { return (wait_common(pool, activity, B_TRUE, tag, waited)); } + +int +lzc_wait_fs(const char *fs, zfs_wait_activity_t activity, boolean_t *waited) +{ + nvlist_t *args = fnvlist_alloc(); + nvlist_t *result = NULL; + + fnvlist_add_int32(args, ZFS_WAIT_ACTIVITY, activity); + + int error = lzc_ioctl(ZFS_IOC_WAIT_FS, fs, args, &result); + + if (error == 0 && waited != NULL) + *waited = fnvlist_lookup_boolean_value(result, + ZFS_WAIT_WAITED); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} diff --git a/man/man8/Makefile.am b/man/man8/Makefile.am index f81a1f6720..8239c2157d 100644 --- a/man/man8/Makefile.am +++ b/man/man8/Makefile.am @@ -41,6 +41,7 @@ dist_man_MANS = \ zfs-unmount.8 \ zfs-upgrade.8 \ zfs-userspace.8 \ + zfs-wait.8 \ zgenhostid.8 \ zinject.8 \ zpool.8 \ diff --git a/man/man8/zfs-wait.8 b/man/man8/zfs-wait.8 new file mode 100644 index 0000000000..dcc679bb07 --- /dev/null +++ b/man/man8/zfs-wait.8 @@ -0,0 +1,71 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. 
+.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZFS-WAIT 8 +.Os Linux +.Sh NAME +.Nm zfs Ns Pf - Cm wait +.Nd Wait for background activity to stop in a ZFS filesystem +.Sh SYNOPSIS +.Nm +.Cm wait +.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ... +.Ar fs +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm wait +.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ... +.Ar fs +.Xc +Waits until all background activity of the given types has ceased in the given +filesystem. +The activity could cease because it has completed or because the filesystem has +been destroyed or unmounted. +If no activities are specified, the command waits until background activity of +every type listed below has ceased. +If there is no activity of the given types in progress, the command returns +immediately. 
+.Pp +These are the possible values for +.Ar activity , +along with what each one waits for: +.Bd -literal + deleteq The filesystem's internal delete queue to empty +.Ed +.Pp +Note that the internal delete queue does not finish draining until +all large files have had time to be fully destroyed and all open file +handles to unlinked files are closed. +.El +.El +.Sh SEE ALSO +.Xr lsof 8 diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index eb6e0e33e4..587f16c4ed 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -281,6 +281,11 @@ Attaches a filesystem to a jail. .It Xr zfs-unjail 8 Detaches a filesystem from a jail. .El +.Ss Waiting +.Bl -tag -width "" +.It Xr zfs-wait 8 +Wait for background activity in a filesystem to complete. +.El .Sh EXIT STATUS The .Nm diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index 7ebf38ddb6..591e35fd10 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -52,6 +52,8 @@ #include #include #include +#include +#include /* * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups @@ -739,6 +741,8 @@ zfs_rmnode(znode_t *zp) zfs_unlinked_add(xzp, tx); } + mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + /* * Remove this znode from the unlinked set. If a has rollback has * occurred while a file is open and unlinked. 
Then when the file @@ -749,6 +753,13 @@ zfs_rmnode(znode_t *zp) zp->z_id, tx); VERIFY(error == 0 || error == ENOENT); + uint64_t count; + if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { + cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); + } + + mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); zfs_znode_delete(zp, tx); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 478e078626..b6757d1bcb 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -872,6 +873,8 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) "num_entries in unlinked set: %llu", zs.zs_num_entries); zfs_unlinked_drain(zfsvfs); + dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dd->dd_activity_cancelled = B_FALSE; } /* @@ -1423,6 +1426,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); } dmu_objset_evict_dbufs(zfsvfs->z_os); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); return (0); } @@ -1813,6 +1818,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) if (err != 0) goto bail; + ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 3e5a67bdb1..2d6e95e314 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -3077,20 +3077,26 @@ dsl_dataset_rename_snapshot(const char *fsname, static int dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) { - boolean_t held; + boolean_t held = B_FALSE; if (!dmu_tx_is_syncing(tx)) return (0); - if (owner != NULL) { - VERIFY3P(ds->ds_owner, ==, owner); - dsl_dataset_long_rele(ds, owner); - } - - held = dsl_dataset_long_held(ds); - - if (owner != 
NULL) - dsl_dataset_long_hold(ds, owner); + dsl_dir_t *dd = ds->ds_dir; + mutex_enter(&dd->dd_activity_lock); + uint64_t holds = zfs_refcount_count(&ds->ds_longholds) - + (owner != NULL ? 1 : 0); + /* + * The value of dd_activity_waiters can change as soon as we drop the + * lock, but we're fine with that; new waiters coming in or old + * waiters leaving doesn't cause problems, since we're going to cancel + * waiters later anyway. The goal of this check is to verify that no + * non-waiters have long-holds, and all new long-holds will be + * prevented because we're holding the pool config as writer. + */ + if (holds != dd->dd_activity_waiters) + held = B_TRUE; + mutex_exit(&dd->dd_activity_lock); if (held) return (SET_ERROR(EBUSY)); @@ -4036,6 +4042,8 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, DMU_MAX_ACCESS * spa_asize_inflation); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); + dsl_dir_cancel_waiters(origin_head->ds_dir); + /* * Swap per-dataset feature flags. */ diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 01b5f080d9..883928f0ec 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -766,6 +766,8 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) if (zfs_refcount_count(&ds->ds_longholds) != expected_holds) return (SET_ERROR(EBUSY)); + ASSERT0(ds->ds_dir->dd_activity_waiters); + mos = ds->ds_dir->dd_pool->dp_meta_objset; /* @@ -1002,6 +1004,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) /* We need to log before removing it from the namespace.
*/ spa_history_log_internal_ds(ds, "destroy", tx, " "); + dsl_dir_cancel_waiters(ds->ds_dir); + rmorigin = (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index 172ebc72c6..63ecb1d39e 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -51,6 +51,9 @@ #include #include "zfs_namecheck.h" #include "zfs_prop.h" +#ifdef _KERNEL +#include +#endif /* * Filesystem and Snapshot Limits @@ -160,6 +163,8 @@ dsl_dir_evict_async(void *dbu) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } @@ -207,6 +212,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, } mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); dsl_prop_init(dd); dsl_dir_snap_cmtime_update(dd); @@ -280,6 +287,8 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; @@ -310,6 +319,8 @@ errout: if (dsl_deadlist_is_open(&dd->dd_livelist)) dsl_dir_livelist_close(dd); dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); @@ -2282,6 +2293,108 @@ dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) } } +static int +dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, + zfs_wait_activity_t activity, boolean_t *in_progress) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&dd->dd_activity_lock)); + + switch (activity) { + case ZFS_WAIT_DELETEQ: 
{ +#ifdef _KERNEL + objset_t *os; + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + break; + + mutex_enter(&os->os_user_ptr_lock); + void *user = dmu_objset_get_user(os); + mutex_exit(&os->os_user_ptr_lock); + if (dmu_objset_type(os) != DMU_OST_ZFS || + user == NULL || zfs_get_vfs_flag_unmounted(os)) { + *in_progress = B_FALSE; + return (0); + } + + uint64_t readonly = B_FALSE; + error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly, + NULL); + + if (error != 0) + break; + + if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) { + *in_progress = B_FALSE; + return (0); + } + + uint64_t count, unlinked_obj; + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &unlinked_obj); + if (error != 0) { + /* ds is held by our caller; don't release it here. */ + break; + } + error = zap_count(os, unlinked_obj, &count); + + if (error == 0) + *in_progress = (count != 0); + break; +#else + /* + * The delete queue is ZPL specific, and libzpool doesn't have + * it. It doesn't make sense to wait for it.
+ */ + *in_progress = B_FALSE; + break; +#endif + } + default: + panic("unrecognized value for activity %d", activity); + } + + return (error); +} + +int +dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, + boolean_t *waited) +{ + int error = 0; + boolean_t in_progress; + dsl_pool_t *dp = dd->dd_pool; + for (;;) { + dsl_pool_config_enter(dp, FTAG); + error = dsl_dir_activity_in_progress(dd, ds, activity, + &in_progress); + dsl_pool_config_exit(dp, FTAG); + if (error != 0 || !in_progress) + break; + + *waited = B_TRUE; + + if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) == + 0 || dd->dd_activity_cancelled) { + error = SET_ERROR(EINTR); + break; + } + } + return (error); +} + +void +dsl_dir_cancel_waiters(dsl_dir_t *dd) +{ + mutex_enter(&dd->dd_activity_lock); + dd->dd_activity_cancelled = B_TRUE; + cv_broadcast(&dd->dd_activity_cv); + while (dd->dd_activity_waiters > 0) + cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock); + mutex_exit(&dd->dd_activity_lock); +} + #if defined(_KERNEL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index d57aef5095..fb9435341d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4072,6 +4072,83 @@ zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +/* + * This ioctl waits for activity of a particular type to complete. If there is + * no activity of that type in progress, it returns immediately, and the + * returned value "waited" is false. If there is activity in progress, the + * ioctl blocks until all activity of that type is complete, and then returns + * with "waited" set to true. + * + * If a thread waiting in the ioctl receives a signal, the call will return + * immediately, and the return value will be EINTR.
+ * + * innvl: { + * "wait_activity" -> int32_t + * } + * + * outnvl: "wait_waited" -> boolean_t + */ +static const zfs_ioc_key_t zfs_keys_fs_wait[] = { + {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, +}; + +static int +zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t activity; + boolean_t waited = B_FALSE; + int error; + dsl_pool_t *dp; + dsl_dir_t *dd; + dsl_dataset_t *ds; + + if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) + return (SET_ERROR(EINVAL)); + + if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) + return (SET_ERROR(EINVAL)); + + if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) + return (error); + + if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + dd = ds->ds_dir; + mutex_enter(&dd->dd_activity_lock); + dd->dd_activity_waiters++; + + /* + * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t + * aren't evicted while we're waiting. Normally this is prevented by + * holding the pool, but we can't do that while we're waiting since + * that would prevent TXGs from syncing out. Some of the functionality + * of long-holds (e.g. preventing deletion) is unnecessary for this + * case, since we would cancel the waiters before proceeding with a + * deletion. An alternative mechanism for keeping the dataset around + * could be developed but this is simpler.
+ */ + dsl_dataset_long_hold(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + error = dsl_dir_wait(dd, ds, activity, &waited); + + dsl_dataset_long_rele(ds, FTAG); + dd->dd_activity_waiters--; + if (dd->dd_activity_waiters == 0) + cv_signal(&dd->dd_activity_cv); + mutex_exit(&dd->dd_activity_lock); + + dsl_dataset_rele(ds, FTAG); + + if (error == 0) + fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); + + return (error); +} + /* * fsname is name of dataset to rollback (to most recent snapshot) * @@ -6915,6 +6992,11 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); + zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, + zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 84ea70f07e..af720ad9b2 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -288,6 +288,10 @@ tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] +[tests/functional/cli_root/zfs_wait] +tests = ['zfs_wait_deleteq'] +tags = ['functional', 'cli_root', 'zfs_wait'] + [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index 47e8ff5e20..3f6147509f 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -739,6 +739,18 @@ test_wait(const char *pool) nvlist_free(optional); } +static void 
+test_wait_fs(const char *dataset) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_int32(required, "wait_activity", 2); + + IOC_INPUT_TEST(ZFS_IOC_WAIT_FS, dataset, required, NULL, EINVAL); + + nvlist_free(required); +} + static void zfs_ioc_input_tests(const char *pool) { @@ -826,6 +838,7 @@ zfs_ioc_input_tests(const char *pool) test_vdev_trim(pool); test_wait(pool); + test_wait_fs(dataset); /* * cleanup @@ -980,6 +993,7 @@ validate_ioc_values(void) CHECK(ZFS_IOC_BASE + 81 == ZFS_IOC_REDACT); CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS); CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT); + CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS); CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT); CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR); CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK); diff --git a/tests/zfs-tests/tests/functional/cli_root/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/Makefile.am index 01af9d6b94..8d99df09f4 100644 --- a/tests/zfs-tests/tests/functional/cli_root/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/Makefile.am @@ -32,6 +32,7 @@ SUBDIRS = \ zfs_unmount \ zfs_unshare \ zfs_upgrade \ + zfs_wait \ zpool \ zpool_add \ zpool_attach \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am new file mode 100644 index 0000000000..d401fe68b1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zfs_wait +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zfs_wait_deleteq.ksh + +dist_pkgdata_DATA = \ + zfs_wait.kshlib diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh new file mode 100755 index 0000000000..456d2d0c2d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh @@ -0,0 
+1,20 @@ +#!/bin/ksh -p +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh new file mode 100755 index 0000000000..cca05fee72 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh @@ -0,0 +1,21 @@ +#!/bin/ksh -p +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib new file mode 100644 index 0000000000..9f62a7c92e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib @@ -0,0 +1,80 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018, 2019 by Delphix. All rights reserved. +# + +typeset -a disk_array=($(find_disks $DISKS)) + +typeset -r DISK1=${disk_array[0]} +typeset -r DISK2=${disk_array[1]} +typeset -r DISK3=${disk_array[2]} + +# +# When the condition it is waiting for becomes true, 'zfs wait' should return +# promptly. We want to enforce this, but any check will be racy because it will +# take some small but indeterminate amount of time for the waiting thread to be +# woken up and for the process to exit. +# +# To deal with this, we provide a grace period after the condition becomes true +# during which 'zfs wait' can exit. If it hasn't exited by the time the grace +# period expires we assume something is wrong and fail the test. While there is +# no value that can really be correct, the idea is we choose something large +# enough that it shouldn't cause issues in practice. +# +typeset -r WAIT_EXIT_GRACE=2.0 + +function proc_exists # pid +{ + ps -p $1 >/dev/null +} + +function proc_must_exist # pid +{ + proc_exists $1 || log_fail "zfs process exited too soon" +} + +function proc_must_not_exist # pid +{ + proc_exists $1 && log_fail "zfs process took too long to exit" +} + +function get_time +{ + date +'%H:%M:%S' +} + +function kill_if_running +{ + typeset pid=$1 + [[ $pid ]] && proc_exists $pid && log_must kill -s TERM $pid +} + +# Log a command and then start it running in the background +function log_bkgrnd +{ + log_note "$(get_time) Starting cmd in background '$@'" + "$@" & +} + +# Check that a background process has completed and exited with a status of 0 +function bkgrnd_proc_succeeded +{ + typeset pid=$1 + + log_must sleep $WAIT_EXIT_GRACE + + proc_must_not_exist $pid + wait $pid || log_fail "process exited with status $?"
+ log_note "$(get_time) wait completed successfully" +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh new file mode 100755 index 0000000000..00c5a109c0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh @@ -0,0 +1,57 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib + +# +# DESCRIPTION: +# 'zfs wait' works when waiting for the delete queue to drain. +# +# STRATEGY: +# 1. Create a file +# 2. Open a file descriptor pointing to that file. +# 3. Delete the file. +# 4. Start a background process waiting for the delete queue to empty. +# 5. Verify that the command doesn't return immediately. +# 6. Close the open file descriptor. +# 7. Verify that the command returns soon after the descriptor is closed. +# + +function cleanup +{ + kill_if_running $pid + exec 3<&- +} + + +typeset -r TESTFILE="/$TESTPOOL/testfile" +typeset pid + +log_onexit cleanup + +log_must touch $TESTFILE +exec 3<> $TESTFILE +log_must rm $TESTFILE +log_bkgrnd zfs wait -t deleteq $TESTPOOL +pid=$! +proc_must_exist $pid + +exec 3<&- +log_must sleep 0.5 +bkgrnd_proc_succeeded $pid + +log_pass "'zfs wait -t deleteq' works."