Btrfs: introduce btrfs_{start, end}_nocow_write() for each subvolume

If the snapshot creation happened after the nocow write but before the dirty data flush, we would fail to flush the dirty data because of no space. So we must keep track of when those nocow write operations start and when they end, if there are nocow writers, the snapshot creators must wait. In order to implement this function, I introduce btrfs_{start, end}_nocow_write(), which is similar to mnt_{want,drop}_write(). These two functions are only used for nocow file write operations. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-03-06 13:38:19 +08:00 · 2014-03-06 13:38:19 +08:00 · 8257b2dc3c
parent 52483bc26f
commit 8257b2dc3c
5 changed files with 133 additions and 10 deletions
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@ -1681,6 +1681,11 @@ struct btrfs_fs_info {
 	unsigned int update_uuid_tree_gen:1;
 };
 struct btrfs_subvolume_writers {
 	struct percpu_counter	counter;
 	wait_queue_head_t	wait;
 };
 /*
 * in ram representation of the tree.  extent_root is used for all allocations
 * and for the extent tree extent_root root.
@ -1829,6 +1834,8 @@ struct btrfs_root {
 	 * manipulation with the read-only status via SUBVOL_SETFLAGS
 	 */
 	int send_in_progress;
 	struct btrfs_subvolume_writers *subv_writers;
 	atomic_t will_be_snapshoted;
 };
 struct btrfs_ioctl_defrag_range_args {
@ -3353,6 +3360,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
 int btrfs_start_nocow_write(struct btrfs_root *root);
 void btrfs_end_nocow_write(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@ -1149,6 +1149,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	}
 }
 static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
 {
 	struct btrfs_subvolume_writers *writers;
 	int ret;
 	writers = kmalloc(sizeof(*writers), GFP_NOFS);
 	if (!writers)
 		return ERR_PTR(-ENOMEM);
 	ret = percpu_counter_init(&writers->counter, 0);
 	if (ret < 0) {
 		kfree(writers);
 		return ERR_PTR(ret);
 	}
 	init_waitqueue_head(&writers->wait);
 	return writers;
 }
 static void
 btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
 {
 	percpu_counter_destroy(&writers->counter);
 	kfree(writers);
 }
 static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 			 u32 stripesize, struct btrfs_root *root,
 			 struct btrfs_fs_info *fs_info,
@ -1205,6 +1231,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
 	atomic_set(&root->refs, 1);
 	atomic_set(&root->will_be_snapshoted, 0);
 	root->log_transid = 0;
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
@ -1502,6 +1529,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 int btrfs_init_fs_root(struct btrfs_root *root)
 {
 	int ret;
 	struct btrfs_subvolume_writers *writers;
 	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
 	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@ -1511,6 +1539,13 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 		goto fail;
 	}
 	writers = btrfs_alloc_subvolume_writers();
 	if (IS_ERR(writers)) {
 		ret = PTR_ERR(writers);
 		goto fail;
 	}
 	root->subv_writers = writers;
 	btrfs_init_free_ino_ctl(root);
 	mutex_init(&root->fs_commit_mutex);
 	spin_lock_init(&root->cache_lock);
@ -1518,8 +1553,11 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 	ret = get_anon_bdev(&root->anon_dev);
 	if (ret)
-		goto fail;
+		goto free_writers;
 	return 0;
 free_writers:
 	btrfs_free_subvolume_writers(root->subv_writers);
 fail:
 	kfree(root->free_ino_ctl);
 	kfree(root->free_ino_pinned);
@ -3459,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root)
 	root->orphan_block_rsv = NULL;
 	if (root->anon_dev)
 		free_anon_bdev(root->anon_dev);
 	if (root->subv_writers)
 		btrfs_free_subvolume_writers(root->subv_writers);
 	free_extent_buffer(root->node);
 	free_extent_buffer(root->commit_root);
 	kfree(root->free_ino_ctl);
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@ -8938,3 +8938,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 	range->len = trimmed;
 	return ret;
 }
 /*
 * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
 * they are used to prevent the some tasks writing data into the page cache
 * by nocow before the subvolume is snapshoted, but flush the data into
 * the disk after the snapshot creation.
 */
 void btrfs_end_nocow_write(struct btrfs_root *root)
 {
 	percpu_counter_dec(&root->subv_writers->counter);
 	/*
 	 * Make sure counter is updated before we wake up
 	 * waiters.
 	 */
 	smp_mb();
 	if (waitqueue_active(&root->subv_writers->wait))
 		wake_up(&root->subv_writers->wait);
 }
 int btrfs_start_nocow_write(struct btrfs_root *root)
 {
 	if (unlikely(atomic_read(&root->will_be_snapshoted)))
 		return 0;
 	percpu_counter_inc(&root->subv_writers->counter);
 	/*
 	 * Make sure counter is updated before we check for snapshot creation.
 	 */
 	smp_mb();
 	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
 		btrfs_end_nocow_write(root);
 		return 0;
 	}
 	return 1;
 }
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@ -1410,6 +1410,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 	u64 num_bytes;
 	int ret;
 	ret = btrfs_start_nocow_write(root);
 	if (!ret)
 		return -ENOSPC;
 	lockstart = round_down(pos, root->sectorsize);
 	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
@ -1427,11 +1431,13 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 	num_bytes = lockend - lockstart + 1;
 	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
-	if (ret <= 0)
+	if (ret <= 0) {
 		ret = 0;
-	else
+		btrfs_end_nocow_write(root);
 	} else {
 		*write_bytes = min_t(size_t, *write_bytes ,
 				     num_bytes - pos + lockstart);
 	}
 	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@ -1520,6 +1526,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			if (!only_release_metadata)
 				btrfs_free_reserved_data_space(inode,
 							       reserve_bytes);
 			else
 				btrfs_end_nocow_write(root);
 			break;
 		}
@ -1608,6 +1616,9 @@ again:
 		}
 		release_bytes = 0;
 		if (only_release_metadata)
 			btrfs_end_nocow_write(root);
 		if (only_release_metadata && copied > 0) {
 			u64 lockstart = round_down(pos, root->sectorsize);
 			u64 lockend = lockstart +
@ -1634,10 +1645,12 @@ again:
 	kfree(pages);
 	if (release_bytes) {
-		if (only_release_metadata)
+		if (only_release_metadata) {
 			btrfs_end_nocow_write(root);
 			btrfs_delalloc_release_metadata(inode, release_bytes);
-		else
+		} else {
 			btrfs_delalloc_release_space(inode, release_bytes);
 		}
 	}
 	return num_written ? num_written : ret;
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@ -611,6 +611,23 @@ fail:
 	return ret;
 }
 static void btrfs_wait_nocow_write(struct btrfs_root *root)
 {
 	s64 writers;
 	DEFINE_WAIT(wait);
 	do {
 		prepare_to_wait(&root->subv_writers->wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		writers = percpu_counter_sum(&root->subv_writers->counter);
 		if (writers)
 			schedule();
 		finish_wait(&root->subv_writers->wait, &wait);
 	} while (writers);
 }
 static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 			   struct dentry *dentry, char *name, int namelen,
 			   u64 *async_transid, bool readonly,
@ -624,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (!root->ref_cows)
 		return -EINVAL;
 	atomic_inc(&root->will_be_snapshoted);
 	smp_mb__after_atomic_inc();
 	btrfs_wait_nocow_write(root);
 	ret = btrfs_start_delalloc_inodes(root, 0);
 	if (ret)
-		return ret;
+		goto out;
 	btrfs_wait_ordered_extents(root, -1);
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot)
+	if (!pending_snapshot) {
-		return -ENOMEM;
+		ret = -ENOMEM;
 		goto out;
 	}
 	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
 			     BTRFS_BLOCK_RSV_TEMP);
@ -649,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 					&pending_snapshot->qgroup_reserved,
 					false);
 	if (ret)
-		goto out;
+		goto free;
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
@ -700,8 +723,10 @@ fail:
 	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
 					 &pending_snapshot->block_rsv,
 					 pending_snapshot->qgroup_reserved);
-out:
+free:
 	kfree(pending_snapshot);
 out:
 	atomic_dec(&root->will_be_snapshoted);
 	return ret;
 }