From af2679e4a7f68f5b99593f61e9cdb8b5b69b0bec Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:48 -0700 Subject: [PATCH 001/164] Btrfs: enhance leak debug checker for extent state and extent buffer This prints out eb->bflags since it contains some useful information, e.g. whether eb is dirty. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dfeb74a0be77..5bd7ecefc2bc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -76,8 +76,8 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); - pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n", - eb->start, eb->len, atomic_read(&eb->refs)); + pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", + eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } From 062d4d1f4085c3135fc159fb7eac96c31b4f47f9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 30 Jan 2018 18:20:46 +0800 Subject: [PATCH 002/164] btrfs: Refactor parameter of BTRFS_MAX_DEVS() from root to fs_info Signed-off-by: Qu Wenruo Reviewed-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b2d05c6b1c56..c9e5f8306d02 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4672,7 +4672,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \ +#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -4729,7 +4729,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = SZ_1G; max_chunk_size = 10 * max_stripe_size; if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info->chunk_root); + devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) @@ -4738,7 +4738,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info->chunk_root); + devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; From b6a535faed06c2bdfaf55b00025dfdcb1eadf980 Mon Sep 17 00:00:00 2001 From: Howard McLauchlan Date: Fri, 2 Feb 2018 11:09:01 -0800 Subject: [PATCH 003/164] btrfs: print error if primary super block write fails Presently, failing a primary super block write but succeeding in at least one super block write in general will appear to users as if nothing important went wrong. However, upon unmounting and re-mounting, the file system will be in a rolled back state. This was discovered with a BCC program that uses bpf_override_return() to fail super block writes. This patch outputs an error clarifying that the primary super block write has failed, so users can expect potentially erroneous behaviour. It also forces wait_dev_supers() to return an error to its caller if the primary super block write fails. 
Signed-off-by: Howard McLauchlan Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 21f34ad0d411..9f7ed4390b33 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3290,6 +3290,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) struct buffer_head *bh; int i; int errors = 0; + bool primary_failed = false; u64 bytenr; if (max_mirrors == 0) @@ -3306,11 +3307,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) BTRFS_SUPER_INFO_SIZE); if (!bh) { errors++; + if (i == 0) + primary_failed = true; continue; } wait_on_buffer(bh); - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { errors++; + if (i == 0) + primary_failed = true; + } /* drop our reference */ brelse(bh); @@ -3319,6 +3325,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) brelse(bh); } + /* log error, force error return */ + if (primary_failed) { + btrfs_err(device->fs_info, "error writing primary super block to device %llu", + device->devid); + return -1; + } + return errors < i ? 0 : -1; } From 7806c6eb15f227a484c368bbaf07da9978f57869 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 15 Dec 2017 12:06:18 +0200 Subject: [PATCH 004/164] btrfs: Remove unused btrfs_start_transaction_lflush function Commit 0e8c36a9fd81 ("Btrfs: fix lots of orphan inodes when the space is not enough") changed the way transaction reservation is made in btrfs_evict_inode and as a result this function became unused. This has been the status quo for 5 years, in which time no one noticed, so I'd say it's safe to assume it's unlikely it will ever be used again. Historical note: there were more attempts to remove the function, but the reasoning was missing and based only on some static analysis tool reports. Another reason for rejection was that there seemed to be a connection to BTRFS_RESERVE_FLUSH_LIMIT and that it would need to be removed too. This was not correct, so removing the function is all we can do. 
Signed-off-by: Nikolay Borisov [ add the note ] Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 -------- fs/btrfs/transaction.h | 3 --- 2 files changed, 11 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 04f07144b45c..2141587195d4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -658,14 +658,6 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( return trans; } -struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, - unsigned int num_items) -{ - return start_transaction(root, num_items, TRANS_START, - BTRFS_RESERVE_FLUSH_LIMIT, true); -} - struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH, diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6beee072b1bd..817fd7c9836b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -186,9 +186,6 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, unsigned int num_items, int min_factor); -struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, - unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); From 7a61f8808833115bc32dd44a9d0ce10039b9399d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 12 Jan 2018 16:52:58 +0000 Subject: [PATCH 005/164] btrfs: remove redundant check on ret and goto The check for a non-zero ret is redundant as the goto will jump to the very next statement anyway. Remove this extraneous code. 
Detected by CoverityScan, CID#1463784 ("Identical code for different branches") Signed-off-by: Colin Ian King Signed-off-by: David Sterba --- fs/btrfs/tests/btrfs-tests.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9786d8cd0aa6..e74278170806 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -278,8 +278,7 @@ int btrfs_run_sanity_tests(void) } } ret = btrfs_test_extent_map(); - if (ret) - goto out; + out: btrfs_destroy_test_fs(); return ret; From 97dc231e8990d484874e75e9f30a2f11fcf035c2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 8 Jan 2018 23:06:32 +0000 Subject: [PATCH 006/164] Btrfs: extent map selftest: add missing void parameter to btrfs_test_extent_map Add a missing void parameter to function btrfs_test_extent_map, fixes sparse warning: warning: non-ANSI function declaration of function 'btrfs_test_extent_map' Signed-off-by: Colin Ian King Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 70c993f01670..c23bd00bdd92 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -343,7 +343,7 @@ static void test_case_4(struct extent_map_tree *em_tree) __test_case_4(em_tree, SZ_4K); } -int btrfs_test_extent_map() +int btrfs_test_extent_map(void) { struct extent_map_tree *em_tree; From af89e0dc2ce3177d07c4af4028b8a7b88733d07e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 11:19:10 +0200 Subject: [PATCH 007/164] btrfs: Don't hardcode the csum size in btrfs_ordered_sum_size Currently the function uses a hardcoded value for the checksum size of a sector. This is fine, given that we currently support only a single algorithm, whose checksum is 4 bytes == sizeof(u32). Despite not having other algorithms, btrfs' design supports using a different algorithm with different space requirements. To future-proof the code, query the size of the currently used algorithm from the in-memory copy of the super block. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Reviewed-by: Su Yue Signed-off-by: David Sterba --- fs/btrfs/ordered-data.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 56c4c0ee6381..c53e2cfb72d9 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -151,7 +151,9 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); + + return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size; } static inline void From 97282031a64ca72aabf6482f9c32d1bcc931cde2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 12 Feb 2018 23:33:29 +0800 Subject: [PATCH 008/164] btrfs: open code btrfs_dev_replace_cancel() btrfs_dev_replace_cancel() calls __btrfs_dev_replace_cancel() for the actual cancel so just open code it. 
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 10 +--------- fs/btrfs/dev-replace.h | 3 +-- fs/btrfs/ioctl.c | 3 ++- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7efbc4d1128b..1ce9528663a6 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -44,7 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device *tgtdev); -static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); static int btrfs_dev_replace_kthread(void *data); static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); @@ -694,14 +693,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, btrfs_dev_replace_unlock(dev_replace, 0); } -int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_dev_replace_args *args) -{ - args->result = __btrfs_dev_replace_cancel(fs_info); - return 0; -} - -static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *tgt_device = NULL; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index f94a76844ae7..d8b314ff12c1 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -32,8 +32,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_dev_replace_args *args); +u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 111ee282b777..9082b06f8889 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4429,7 +4429,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = 0; break; case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: - ret = btrfs_dev_replace_cancel(fs_info, p); + p->result = __btrfs_dev_replace_cancel(fs_info); + ret = 0; break; default: ret = -EINVAL; From 17d202b9738887c60b4903937b569df1e266eabb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 12 Feb 2018 23:33:30 +0800 Subject: [PATCH 009/164] btrfs: rename __btrfs_dev_replace_cancel() Remove __ which is for the special functions. 
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/dev-replace.h | 2 +- fs/btrfs/ioctl.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1ce9528663a6..4d1dbc16fcd4 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -693,7 +693,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, btrfs_dev_replace_unlock(dev_replace, 0); } -u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +u64 btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *tgt_device = NULL; diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index d8b314ff12c1..6c3543d93e96 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -32,7 +32,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +u64 btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9082b06f8889..c5a559105949 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4429,7 +4429,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = 0; break; case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: - p->result = __btrfs_dev_replace_cancel(fs_info); + p->result = btrfs_dev_replace_cancel(fs_info); ret = 0; break; default: From 18e67c73dc6ea5bc5d9591abf92f8841290c4fcc Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 12 Feb 2018 23:33:31 +0800 Subject: [PATCH 010/164] btrfs: btrfs_dev_replace_cancel() can return int Current u64 return from btrfs_dev_replace_cancel() was probably done to match the btrfs_ioctl_dev_replace_args::result. However, as the actual return value fits in an int and it further gets typecast to u64, just return int. 
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/dev-replace.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4d1dbc16fcd4..a428d528220f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -693,13 +693,13 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, btrfs_dev_replace_unlock(dev_replace, 0); } -u64 btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *tgt_device = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root = fs_info->tree_root; - u64 result; + int result; int ret; if (sb_rdonly(fs_info->sb)) diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 6c3543d93e96..389de365b0db 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -32,7 +32,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -u64 btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); +int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); From 15fc1283f631552ffedebd14e4fd5a36438e7d2a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 12 Feb 2018 23:36:25 +0800 Subject: [PATCH 011/164] btrfs: open code btrfs_init_dev_replace_tgtdev_for_resume() btrfs_init_dev_replace_tgtdev_for_resume() initializes replace target device in a few simple steps, so do it at the parent function. Moreover, there isn't any other caller so just open code it. 
Signed-off-by: Anand Jain Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 10 ++++++++-- fs/btrfs/volumes.c | 13 ------------- fs/btrfs/volumes.h | 2 -- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a428d528220f..dd717e204b5e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -173,8 +173,14 @@ no_valid_dev_replace_entry_found: } set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev_replace->tgtdev->dev_state); - btrfs_init_dev_replace_tgtdev_for_resume(fs_info, - dev_replace->tgtdev); + + WARN_ON(fs_info->fs_devices->rw_devices == 0); + dev_replace->tgtdev->io_width = fs_info->sectorsize; + dev_replace->tgtdev->io_align = fs_info->sectorsize; + dev_replace->tgtdev->sector_size = fs_info->sectorsize; + dev_replace->tgtdev->fs_info = fs_info; + set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &dev_replace->tgtdev->dev_state); } break; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c9e5f8306d02..bdfde42ebebf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2666,19 +2666,6 @@ error: return ret; } -void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, - struct btrfs_device *tgtdev) -{ - u32 sectorsize = fs_info->sectorsize; - - WARN_ON(fs_info->fs_devices->rw_devices == 0); - tgtdev->io_width = sectorsize; - tgtdev->io_align = sectorsize; - tgtdev->sector_size = sectorsize; - tgtdev->fs_info = fs_info; - set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state); -} - static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 28c28eeadff3..2a171f9b2c04 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -476,8 +476,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, - struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); From 0e34693f7bb149273b747194b3988801a9ca8c8e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:37 +0200 Subject: [PATCH 012/164] btrfs: Make btrfs_trans_release_metadata private to transaction.c This function is only ever used in __btrfs_end_transaction and btrfs_commit_transaction so there is no need to export it via header. Let's move it closer to where it's used, make it static and remove it from the header. No functional changes. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/extent-tree.c | 18 ------------------ fs/btrfs/transaction.c | 19 +++++++++++++++++++ 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index da308774b8a4..f7ab01fa5315 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2733,8 +2733,6 @@ void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); -void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c1618ab9fecf..8f772fec9d6d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5893,24 +5893,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } -void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - if (!trans->block_rsv) { - ASSERT(!trans->bytes_reserved); - return; - } - - if (!trans->bytes_reserved) - return; - - ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, trans->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); - trans->bytes_reserved = 0; -} /* * To be called after all the new block groups attached to the transaction diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2141587195d4..beca25635787 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -818,6 +818,25 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) return should_end_transaction(trans); } +static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (!trans->block_rsv) { + ASSERT(!trans->bytes_reserved); + return; + } + + if (!trans->bytes_reserved) + return; + + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, trans->bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; +} + static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int throttle) { From c9b577c01ac91a82fce697c445cfac2bcd8a49f6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:38 +0200 Subject: [PATCH 013/164] btrfs: Open code btrfs_write_and_wait_marked_extents btrfs_write_and_wait_transaction is essentially a wrapper of btrfs_write_and_wait_marked_extents with the addition of calling clear_btree_io_tree. Having the code split doesn't really bring any benefit. Open code the latter into the former and add a proper documentation header. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ reformat comment ] Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 45 ++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index beca25635787..03fbb8854a1b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1083,40 +1083,33 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) } /* - * when btree blocks are allocated, they have some corresponding bits set for - * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * When btree blocks are allocated the corresponding extents are marked dirty. + * This function ensures such extents are persisted on disk for transaction or + * log commit. + * + * @trans: transaction whose dirty pages we'd like to write */ -static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, int mark) -{ - int ret; - int ret2; - struct blk_plug plug; - - blk_start_plug(&plug); - ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark); - blk_finish_plug(&plug); - ret2 = btrfs_wait_extents(fs_info, dirty_pages); - - if (ret) - return ret; - if (ret2) - return ret2; - return 0; -} - static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { int ret; + int ret2; + struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages; + struct blk_plug plug; + + blk_start_plug(&plug); + ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY); + blk_finish_plug(&plug); + ret2 = btrfs_wait_extents(fs_info, dirty_pages); - ret = btrfs_write_and_wait_marked_extents(fs_info, - &trans->transaction->dirty_pages, - EXTENT_DIRTY); clear_btree_io_tree(&trans->transaction->dirty_pages); - return ret; + if (ret) + return ret; + else if (ret2) + return ret2; + else + return 0; } /* From dc60c525cff11a5e60073cd1f618023b28b64ff1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:39 +0200 Subject: [PATCH 014/164] btrfs: Remove fs_info argument from btrfs_trans_release_metadata All current callers of this function just get a reference to the trans->fs_info member and pass it as the second argument. Collapse this into the function itself. No functional changes. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 03fbb8854a1b..bfbc1ace2a24 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -818,9 +818,11 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) return should_end_transaction(trans); } -static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; + if (!trans->block_rsv) { ASSERT(!trans->bytes_reserved); return; } @@ -854,7 +856,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } - btrfs_trans_release_metadata(trans, info); + btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) @@ -875,7 +877,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - btrfs_trans_release_metadata(trans, info); + btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) @@ -1969,7 +1971,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - btrfs_trans_release_metadata(trans, fs_info); + btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; cur_trans = trans->transaction; @@ -2323,7 +2325,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) scrub_continue: btrfs_scrub_continue(fs_info); cleanup_transaction: - btrfs_trans_release_metadata(trans, fs_info); + btrfs_trans_release_metadata(trans); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); From 6c686b359a2dc501353ea61adcca441dd1473e91 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:40 +0200 Subject: [PATCH 015/164] btrfs: Remove fs_info argument from btrfs_create_pending_block_groups It can be referenced from the passed transaction, so there is no point in passing it as a function argument. No functional changes. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 10 +++++----- fs/btrfs/transaction.c | 6 +++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f7ab01fa5315..d58625a31109 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2697,8 +2697,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8f772fec9d6d..47d8992cc9fa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3086,7 +3086,7 @@ again: if (run_all) { if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); node = rb_first(&delayed_refs->href_root); @@ -3686,7 +3686,7 @@ again: * make sure all the block groups on our dirty list actually * exist */ - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); if (!path) { path = btrfs_alloc_path(); @@ -4706,7 +4706,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -10151,9 +10151,9 @@ error: return ret; } -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *block_group, *tmp; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bfbc1ace2a24..e31c4051368b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -860,7 +860,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, info); + btrfs_create_pending_block_groups(trans); trans->delayed_ref_updates = 0; if (!trans->sync) { @@ -881,7 +881,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, info); + btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); @@ -1984,7 +1984,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) smp_wmb(); if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); ret = btrfs_run_delayed_refs(trans, fs_info, 0); if (ret) { From 21217054203cd10f26ba133352046895c16cd3de Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:41 +0200 Subject: [PATCH 016/164] btrfs: Don't pass fs_info 
arg to btrfs_start_dirty_block_groups It can be referenced from the passed transaction, so there is no point in passing it as a function argument. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/transaction.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d58625a31109..2c96275c17e6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2675,8 +2675,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47d8992cc9fa..194b813045a7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3660,9 +3660,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, * the commit latency by getting rid of the easy block groups while * we're still allowing others to join the commit. */ -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e31c4051368b..87f94228abe9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2015,7 +2015,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->ro_block_group_mutex); if (run_it) - ret = btrfs_start_dirty_block_groups(trans, fs_info); + ret = btrfs_start_dirty_block_groups(trans); } if (ret) { btrfs_end_transaction(trans); From b84acab38f58b775f8cd3f38422c610a75b3eb70 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:42 +0200 Subject: [PATCH 017/164] btrfs: Don't pass fs_info to __btrfs_run_delayed_items We already pass the transaction handle, which contains a reference to the fs_info, so grab it from there. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0530f6f2e4ba..1acc03680e9c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1115,9 +1115,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, * Returns < 0 on error and returns with an aborted transaction with any * outstanding delayed items cleaned up. 
*/ -static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr) +static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; @@ -1165,13 +1165,13 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - return __btrfs_run_delayed_items(trans, fs_info, -1); + return __btrfs_run_delayed_items(trans, -1); } int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, int nr) { - return __btrfs_run_delayed_items(trans, fs_info, nr); + return __btrfs_run_delayed_items(trans, nr); } int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, From e5c304e651e6ab13495d4aabb5e7d5d37933dc04 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:43 +0200 Subject: [PATCH 018/164] btrfs: Don't pass fs_info to btrfs_run_delayed_items/_nr We already pass the transaction which has a reference to the fs_info, so use that. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 6 ++---- fs/btrfs/delayed-inode.h | 6 ++---- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/transaction.c | 8 ++++---- fs/btrfs/tree-log.c | 12 ++++-------- 5 files changed, 13 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1acc03680e9c..09939fc37f2a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1162,14 +1162,12 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) return ret; } -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans) { return __btrfs_run_delayed_items(trans, -1); } -int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr) +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) { return __btrfs_run_delayed_items(trans, nr); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index c4189d495934..ae893d85224f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -111,10 +111,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode); -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, int nr); +int btrfs_run_delayed_items(struct btrfs_trans_handle *trans); +int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr); void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 194b813045a7..7b4d6789cc66 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4994,7 +4994,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_run_delayed_items_nr(trans, fs_info, nr); + ret = btrfs_run_delayed_items_nr(trans, nr); btrfs_end_transaction(trans); break; case FLUSH_DELALLOC: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87f94228abe9..e332026a7f82 100644 --- 
a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1530,7 +1530,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * otherwise we corrupt the FS during * snapshot */ - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) { /* Transaction aborted */ btrfs_abort_transaction(trans, ret); goto fail; @@ -2067,7 +2067,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto cleanup_transaction; - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) goto cleanup_transaction; @@ -2075,7 +2075,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) goto cleanup_transaction; @@ -2128,7 +2128,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * because all the tree which are snapshoted will be forced to COW * the nodes and leaves. */ - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) { mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 434457794c27..fac5fd1cc786 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -853,7 +853,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_dir_item *di) { - struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode; char *name; int name_len; @@ -887,7 +886,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; else - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); out: kfree(name); iput(inode); @@ -1007,7 +1006,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, u64 ref_index, char *name, int namelen, int *search_done) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *victim_name; int victim_name_len; @@ -1065,7 +1063,7 @@ again: kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); if (ret) return ret; *search_done = 1; @@ -1136,8 +1134,7 @@ again: victim_name_len); if (!ret) ret = btrfs_run_delayed_items( - trans, - fs_info); + trans); } iput(victim_parent); kfree(victim_name); @@ -2098,7 +2095,6 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct inode *dir, struct btrfs_key *dir_key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct extent_buffer *eb; int slot; @@ -2162,7 +2158,7 @@ again: ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(inode), name, name_len); if (!ret) - ret = btrfs_run_delayed_items(trans, fs_info); + ret = btrfs_run_delayed_items(trans); kfree(name); iput(inode); if (ret) From 7e4443d9eb33a4fefb8e929cbbacd9ad58727244 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:44 +0200 Subject: [PATCH 019/164] btrfs: Don't pass fs_info to commit_fs_roots We already pass the transaction handle which has a reference to the fs_info. No functional changes. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e332026a7f82..f831251237fe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1257,9 +1257,9 @@ void btrfs_add_dead_root(struct btrfs_root *root) /* * update all the cowonly tree roots on disk */ -static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *gang[8]; int i; int ret; @@ -1377,7 +1377,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, */ mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans, fs_info); + ret = commit_fs_roots(trans); if (ret) goto out; ret = btrfs_qgroup_account_extents(trans, fs_info); @@ -2163,7 +2163,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans, fs_info); + ret = commit_fs_roots(trans); if (ret) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); From 9386d8bc58e2088f05d2d49a9b8af626ebbe687b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:45 +0200 Subject: [PATCH 020/164] btrfs: Don't pass fs_info to commit_cowonly_roots We already pass a transaction handle which references the fs_info, so we can grab it from there. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f831251237fe..6cba2b0c2b8a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1161,9 +1161,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, * failures will cause the file system to go offline. We still need * to clean up the delayed refs. */ -static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; @@ -1403,7 +1403,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * like chunk and root tree, as they won't affect qgroup. * And we don't write super to avoid half committed status. */ - ret = commit_cowonly_roots(trans, fs_info); + ret = commit_cowonly_roots(trans); if (ret) goto out; switch_commit_roots(trans->transaction, fs_info); @@ -2203,7 +2203,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - ret = commit_cowonly_roots(trans, fs_info); + ret = commit_cowonly_roots(trans); if (ret) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); From 97cb39bb912c8092c5570578f2eb1b2988ac1af8 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:46 +0200 Subject: [PATCH 021/164] btrfs: Remove root argument of cleanup_transaction The only thing the passed root is used for is: 1. to get a reference to the fs_info and 2. to call trace_btrfs_transaction_commit. We can achieve 1) by simply referring to the fs_info from passed trans object. 
As far as 2) is concerned cleanup_transaction is called from only one place and the 'root' argument passed is the one from the trans handle. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6cba2b0c2b8a..ad7546745b5b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1867,10 +1867,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, } -static void cleanup_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int err) +static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; DEFINE_WAIT(wait); @@ -1910,7 +1909,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); - trace_btrfs_transaction_commit(root); + trace_btrfs_transaction_commit(trans->root); if (current->journal_info == trans) current->journal_info = NULL; @@ -2331,7 +2330,7 @@ cleanup_transaction: btrfs_warn(fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; - cleanup_transaction(trans, trans->root, ret); + cleanup_transaction(trans, ret); return ret; } From 16916a88d43c5f82c20a810e54d1a7f0d3d3a9ca Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:47 +0200 Subject: [PATCH 022/164] btrfs: Remove fs_info argument from switch_commit_roots We already have the fs_info from the passed transaction so use it directly. No functional changes. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ad7546745b5b..0d36337b0fcb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -126,9 +126,9 @@ static void clear_btree_io_tree(struct extent_io_tree *tree) spin_unlock(&tree->lock); } -static noinline void switch_commit_roots(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) +static noinline void switch_commit_roots(struct btrfs_transaction *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; down_write(&fs_info->commit_root_sem); @@ -1406,7 +1406,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_cowonly_roots(trans); if (ret) goto out; - switch_commit_roots(trans->transaction, fs_info); + switch_commit_roots(trans->transaction); ret = btrfs_write_and_wait_transaction(trans, fs_info); if (ret) btrfs_handle_fs_error(fs_info, ret, @@ -2234,7 +2234,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); - switch_commit_roots(cur_trans, fs_info); + switch_commit_roots(cur_trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); From 08d50ca32cd0554699b7680fbda97c94f49c73f2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:48 +0200 Subject: [PATCH 023/164] btrfs: Remove fs_info argument from create_pending_snapshots/create_pending_snapshot We already pass the trans handle which has a reference to fs_info to create_pending_snapshot so we can refer to it directly. Doing this obviates the need to pass the fs_info to create_pending_snapshots as well. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0d36337b0fcb..4daf51f6cf52 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1436,9 +1436,10 @@ out: * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, struct btrfs_pending_snapshot *pending) { + + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key key; struct btrfs_root_item *new_root_item; struct btrfs_root *tree_root = fs_info->tree_root; @@ -1705,8 +1706,7 @@ no_free_objectid: /* * create all the snapshots we've scheduled for creation */ -static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) { struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; @@ -1714,7 +1714,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, list_for_each_entry_safe(pending, next, head, list) { list_del(&pending->list); - ret = create_pending_snapshot(trans, fs_info, pending); + ret = create_pending_snapshot(trans, pending); if (ret) break; } @@ -2111,7 +2111,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * deal with them in create_pending_snapshot(), which is the * core function of the snapshot creation. 
*/ - ret = create_pending_snapshots(trans, fs_info); + ret = create_pending_snapshots(trans); if (ret) { mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; From e9b919b1f73f1a363988ae1b9fba66f83a221f2e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:49 +0200 Subject: [PATCH 024/164] btrfs: Remove fs_info argument from btrfs_update_commit_device_bytes_used We already pass the btrfs_transaction which references fs_info so no need to pass the later as an argument. Also use the opportunity to shorten transaction->trans. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 2 +- fs/btrfs/volumes.c | 8 ++++---- fs/btrfs/volumes.h | 3 +-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4daf51f6cf52..f6a33f474e7b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2246,7 +2246,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) sizeof(*fs_info->super_copy)); btrfs_update_commit_device_size(fs_info); - btrfs_update_commit_device_bytes_used(fs_info, cur_trans); + btrfs_update_commit_device_bytes_used(cur_trans); clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bdfde42ebebf..e4bfd81340a7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7345,20 +7345,20 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) } /* Must be invoked during the transaction commit */ -void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *transaction) +void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_map *em; struct map_lookup *map; struct btrfs_device *dev; int i; - if (list_empty(&transaction->pending_chunks)) + if (list_empty(&trans->pending_chunks)) return; /* In order to kick the device replace finish process */ mutex_lock(&fs_info->chunk_mutex); - list_for_each_entry(em, &transaction->pending_chunks, list) { + list_for_each_entry(em, &trans->pending_chunks, list) { map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2a171f9b2c04..654df38faa7a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -545,8 +545,7 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, } void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); -void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *transaction); +void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans); struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); From 70458a58190ab3aa0267e539cac7c8dcb6dc5dd9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 7 Feb 2018 17:55:50 +0200 Subject: [PATCH 025/164] btrfs: Remove fs_info argument of btrfs_write_and_wait_transaction We already pass btrfs_trans_handle which contains a reference to the fs_info so use that. No functional changes. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f6a33f474e7b..ca2d91163af9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1091,12 +1091,12 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) * * @trans: transaction whose dirty pages we'd like to write */ -static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) { int ret; int ret2; struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages; + struct btrfs_fs_info *fs_info = trans->fs_info; struct blk_plug plug; blk_start_plug(&plug); @@ -1407,7 +1407,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, if (ret) goto out; switch_commit_roots(trans->transaction); - ret = btrfs_write_and_wait_transaction(trans, fs_info); + ret = btrfs_write_and_wait_transaction(trans); if (ret) btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction for qgroup"); @@ -2261,7 +2261,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_wait); - ret = btrfs_write_and_wait_transaction(trans, fs_info); + ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); From ba020491c8d08ec500ce1ddfd0715168a0ab9241 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2018 12:35:44 +0800 Subject: [PATCH 026/164] btrfs: extent_buffer_uptodate() make it static and inline extent_buffer_uptodate() is a trivial wrapper around test_bit() and nothing else. So make it static and inline, save on code space and call indirection. 
Before: text data bss dec hex filename 1131257 82898 18992 1233147 12d0fb fs/btrfs/btrfs.ko After: text data bss dec hex filename 1131090 82898 18992 1232980 12d054 fs/btrfs/btrfs.ko Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ----- fs/btrfs/extent_io.h | 6 +++++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5bd7ecefc2bc..4e73705b405e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5230,11 +5230,6 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } -int extent_buffer_uptodate(struct extent_buffer *eb) -{ - return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -} - int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, int wait, int mirror_num) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a7a850abd600..da9be2fb0502 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -455,6 +455,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb) atomic_inc(&eb->refs); } +static inline int extent_buffer_uptodate(struct extent_buffer *eb) +{ + return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); +} + int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len); void read_extent_buffer(const struct extent_buffer *eb, void *dst, @@ -489,7 +494,6 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(const struct extent_buffer *eb, unsigned long offset, unsigned long min_len, From f7b885befd05fa4f546cdc3e6c9a3b4a30484cd1 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2018 17:50:42 +0800 Subject: [PATCH 027/164] btrfs: manage thread_pool mount option as %u The mount option thread_pool is always unsigned. Manage it that way all around. 
Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/super.c | 13 ++++++------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c96275c17e6..407f435245f5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -935,7 +935,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; - int thread_pool_size; + u32 thread_pool_size; struct kobject *space_info_kobj; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f7ed4390b33..47624407bb94 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2183,7 +2183,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { - int max_active = fs_info->thread_pool_size; + u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = @@ -2404,7 +2404,7 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - int max_active; + u32 max_active; int clear_free_space_tree = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4b817947e00f..790f3ab600cd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -345,7 +345,7 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, - {Opt_thread_pool, "thread_pool=%d"}, + {Opt_thread_pool, "thread_pool=%u"}, {Opt_compress, "compress"}, {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, @@ -594,12 +594,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = match_int(&args[0], &intarg); if (ret) { goto out; - } else if (intarg > 0) { - info->thread_pool_size = intarg; - } else { + } else if (intarg == 0) { ret = -EINVAL; goto out; } + info->thread_pool_size = intarg; break; case Opt_max_inline: num = match_strdup(&args[0]); @@ -1284,7 +1283,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) - seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + seq_printf(seq, ",thread_pool=%u", info->thread_pool_size); if (btrfs_test_opt(info, COMPRESS)) { compress_type = btrfs_compress_type2str(info->compress_type); if (btrfs_test_opt(info, FORCE_COMPRESS)) @@ -1690,7 +1689,7 @@ out: } static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, - int new_pool_size, int old_pool_size) + u32 new_pool_size, u32 old_pool_size) { if (new_pool_size == old_pool_size) return; @@ -1758,7 +1757,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_opts = fs_info->mount_opt; unsigned long old_compress_type = fs_info->compress_type; u64 old_max_inline = fs_info->max_inline; - int old_thread_pool_size = fs_info->thread_pool_size; + u32 old_thread_pool_size = fs_info->thread_pool_size; unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; From 764cb8b43d4d8047460ed0e530fc109434c346f2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2018 17:50:44 +0800 Subject: [PATCH 028/164] btrfs: manage 
metadata_ratio mount option as %u As metadata_ratio mount option is unsigned so manage it as %u for token verifications, instead of %d. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 790f3ab600cd..79900e1135d2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -361,7 +361,7 @@ static const match_table_t tokens = { {Opt_norecovery, "norecovery"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_noflushoncommit, "noflushoncommit"}, - {Opt_ratio, "metadata_ratio=%d"}, + {Opt_ratio, "metadata_ratio=%u"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, @@ -657,16 +657,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) { + if (ret) goto out; - } else if (intarg >= 0) { - info->metadata_ratio = intarg; - btrfs_info(info, "metadata ratio %d", - info->metadata_ratio); - } else { - ret = -EINVAL; - goto out; - } + info->metadata_ratio = intarg; + btrfs_info(info, "metadata ratio %u", + info->metadata_ratio); break; case Opt_discard: btrfs_set_and_info(info, DISCARD, @@ -1339,8 +1334,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) info->check_integrity_print_mask); #endif if (info->metadata_ratio) - seq_printf(seq, ",metadata_ratio=%d", - info->metadata_ratio); + seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) From 02453bdeb02482cb405700dca70d158b4c042e71 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2018 17:50:45 +0800 Subject: [PATCH 029/164] btrfs: manage check_int_print_mask mount option as %u As check_int_print_mask mount option is unsigned so manage it as %u for token verifications, instead of %d.
Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 79900e1135d2..d3a49b8c760b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -381,7 +381,7 @@ static const match_table_t tokens = { {Opt_skip_balance, "skip_balance"}, {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_commit_interval, "commit=%d"}, @@ -756,17 +756,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) { + if (ret) goto out; - } else if (intarg >= 0) { - info->check_integrity_print_mask = intarg; - btrfs_info(info, - "check_integrity_print_mask 0x%x", - info->check_integrity_print_mask); - } else { - ret = -EINVAL; - goto out; - } + info->check_integrity_print_mask = intarg; + btrfs_info(info, "check_integrity_print_mask 0x%x", + info->check_integrity_print_mask); break; #else case Opt_check_integrity_including_extent_data: From d3740608646f72fb94705a853946f647abcfaec4 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2018 17:50:46 +0800 Subject: [PATCH 030/164] btrfs: manage commit mount option as %u As the commit mount option is unsigned so manage it as %u for token verifications, instead of %d. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/super.c | 26 ++++++++++---------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 407f435245f5..024d5feb5856 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -788,7 +788,7 @@ struct btrfs_fs_info { unsigned long pending_changes; unsigned long compress_type:4; unsigned int compress_level; - int commit_interval; + u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a * wrong number because we will write out the data into a regular diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d3a49b8c760b..3656d0575ed8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -384,7 +384,7 @@ static const match_table_t tokens = { {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_commit_interval, "commit=%d"}, + {Opt_commit_interval, "commit=%u"}, #ifdef CONFIG_BTRFS_DEBUG {Opt_fragment_data, "fragment=data"}, {Opt_fragment_metadata, "fragment=metadata"}, @@ -786,24 +786,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret < 0) { - btrfs_err(info, "invalid commit interval"); - ret = -EINVAL; + if (ret) goto out; - } - if (intarg > 0) { - if (intarg > 300) { - btrfs_warn(info, - "excessive commit interval %d", - intarg); - } - info->commit_interval = intarg; - } else { + if (intarg == 0) { btrfs_info(info, - "using default commit interval %ds", + "using default commit interval %us", BTRFS_DEFAULT_COMMIT_INTERVAL); - info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + intarg = BTRFS_DEFAULT_COMMIT_INTERVAL; + 
} else if (intarg > 300) { + btrfs_warn(info, "excessive commit interval %d", + intarg); } + info->commit_interval = intarg; break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1332,7 +1326,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) - seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",commit=%u", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); From eceff22a8067fa5f587d1bab0eb66503d33b7164 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2018 17:50:47 +0800 Subject: [PATCH 031/164] btrfs: add a comment to mark the deprecated mount option The options alloc_start and subvolrootid are deprecated, comment them in the tokens list. And leave them as it is. No functional changes. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3656d0575ed8..540c18511e7a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -344,7 +344,7 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_barrier, "barrier"}, {Opt_max_inline, "max_inline=%s"}, - {Opt_alloc_start, "alloc_start=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, /* deprecated */ {Opt_thread_pool, "thread_pool=%u"}, {Opt_compress, "compress"}, {Opt_compress_type, "compress=%s"}, @@ -370,7 +370,7 @@ static const match_table_t tokens = { {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, {Opt_noenospc_debug, "noenospc_debug"}, - {Opt_subvolrootid, "subvolrootid=%d"}, + {Opt_subvolrootid, "subvolrootid=%d"}, /* deprecated */ {Opt_defrag, "autodefrag"}, {Opt_nodefrag, "noautodefrag"}, {Opt_inode_cache, "inode_cache"}, From acf18c56fdcb952a06650282192e3b4ca1855c5e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 24 Feb 2018 19:43:56 +0800 Subject: [PATCH 032/164] btrfs: fix null pointer deref when target device is missing The replace target device can be missing when mounted with -o degraded, but we wont allocate a missing btrfs_device to it. So check the device before accessing. BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0 IP: btrfs_destroy_dev_replace_tgtdev+0x43/0xf0 [btrfs] Call Trace: btrfs_dev_replace_cancel+0x15f/0x180 [btrfs] btrfs_ioctl+0x2216/0x2590 [btrfs] do_vfs_ioctl+0x625/0x650 SyS_ioctl+0x4e/0x80 do_syscall_64+0x5d/0x160 entry_SYSCALL64_slow_path+0x25/0x25 This patch has been moved in front of patch "btrfs: log, when replace, is canceled by the user" that could reproduce the crash if the system reboots inside btrfs_dev_replace_start before the btrfs_dev_replace_finishing call. 
$ mkfs /dev/sda $ mount /dev/sda mnt $ btrfs replace start /dev/sda /dev/sdb $ mount -o degraded /dev/sdb mnt Signed-off-by: Anand Jain [ added reproducer description from mail ] Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dd717e204b5e..c523478b3de2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -312,7 +312,7 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) static char* btrfs_dev_name(struct btrfs_device *device) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) return ""; else return rcu_str_deref(device->name); From 8f2ceaa7b42a6df21eed621b88037af2c4cd8257 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 13 Feb 2018 11:53:43 +0800 Subject: [PATCH 033/164] btrfs: log, when replace, is canceled by the user For debugging or administration purposes, we would want to know if and when the user cancels the replace, to complement the existing messages when dev-replace starts or finishes. Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog, fold fix for RCU warning from Nikolay ] Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c523478b3de2..e279f04b3388 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -703,6 +703,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *tgt_device = NULL; + struct btrfs_device *src_device = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root = fs_info->tree_root; int result; @@ -724,6 +725,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; break; @@ -741,6 +743,12 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) } ret = btrfs_commit_transaction(trans); WARN_ON(ret); + + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); From bf6d7d4900c0d0c15dcd1baa1b65e6519bf4abef Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Feb 2018 18:25:17 +0200 Subject: [PATCH 034/164] btrfs: Remove invalid null checks from btrfs_cleanup_dirty_bgs list_first_entry is essentially a wrapper over container_of. The latter can never return null even if it's working on an inconsistent list since it will either crash or return some offset in the wrong struct. Additionally, for the dirty_bgs list the iteration is done under dirty_bgs_lock which ensures consistency of the list.
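For reference, a simplified sketch of the macros involved (paraphrased, not verbatim kernel source) shows why a NULL check on the result is dead code:

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)

#define list_first_entry(ptr, type, member) \
	list_entry((ptr)->next, type, member)

For a list that is already known to be non-empty, (ptr)->next points at a real node and the offset subtraction cannot produce NULL, so the removed checks could never fire.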
Signed-off-by: Nikolay Borisov Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 47624407bb94..dcfdb1f17f1b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4327,11 +4327,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group dirty_bgs list"); - spin_unlock(&cur_trans->dirty_bgs_lock); - return; - } if (!list_empty(&cache->io_list)) { spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4355,10 +4350,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, cache = list_first_entry(&cur_trans->io_bgs, struct btrfs_block_group_cache, io_list); - if (!cache) { - btrfs_err(fs_info, "orphan block group on io_bgs list"); - return; - } list_del_init(&cache->io_list); spin_lock(&cache->lock); From 45ae2c1841c31c90077cf427c09ea0e83e381026 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Feb 2018 18:25:18 +0200 Subject: [PATCH 035/164] btrfs: Document consistency of transaction->io_bgs list The reason why io_bgs can be modified without holding any lock is non-obvious. Document it and reference that documentation from the respective call sites. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++++ fs/btrfs/extent-tree.c | 9 +++++++-- fs/btrfs/transaction.h | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dcfdb1f17f1b..08e33f316a86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4346,6 +4346,10 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, } spin_unlock(&cur_trans->dirty_bgs_lock); + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, struct btrfs_block_group_cache, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7b4d6789cc66..9a677860f64d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3741,8 +3741,9 @@ again: should_put = 0; /* - * the cache_write_mutex is protecting - * the io_list + * The cache_write_mutex is protecting the + * io_list, also refer to the definition of + * btrfs_transaction::io_bgs for more details */ list_add_tail(&cache->io_list, io); } else { @@ -3934,6 +3935,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } spin_unlock(&cur_trans->dirty_bgs_lock); + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, io_list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 817fd7c9836b..2762c8d6191c 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -69,6 +69,22 @@ struct btrfs_transaction { struct list_head pending_chunks; struct list_head switch_commits; struct list_head dirty_bgs; + + /* + * There is no explicit lock which protects io_bgs, rather its + * consistency is implied by the fact that all the sites which modify + * it do so under some form of transaction critical section, namely: + * + * - btrfs_start_dirty_block_groups - This function can only ever be + * run by one of the transaction committers. 
Refer to + * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction + * + * - btrfs_write_dirty_blockgroups - this is called by + * commit_cowonly_roots from transaction critical section + * (TRANS_STATE_COMMIT_DOING) + * + * - btrfs_cleanup_dirty_bgs - called on transaction abort + */ struct list_head io_bgs; struct list_head dropped_roots; From 9a3daff320260723b9e64b5b3b9c6592a0cd006a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 15 Dec 2017 12:05:37 +0200 Subject: [PATCH 036/164] btrfs: Add enospc_debug printing in metadata_reserve_bytes Currently when enospc_debug mount option is turned on we do not print any debug info in case metadata reservation failures happen. Fix this by adding the necessary hook in reserve_metadata_bytes. Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9a677860f64d..85258e85a030 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5394,10 +5394,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root, !block_rsv_use_bytes(global_rsv, orig_bytes)) ret = 0; } - if (ret == -ENOSPC) + if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", block_rsv->space_info->flags, orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } return ret; } From 86d750a48a6d22435265abf525d8a27fa636db26 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 18 Nov 2017 07:02:16 +0800 Subject: [PATCH 037/164] btrfs: remove unused hardirq.h Preempt counter APIs have been split out, currently, hardirq.h just includes irq_enter/exit APIs which are not used by btrfs at all. So, remove the unused hardirq.h. Signed-off-by: Yang Shi Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index d3bd02105d1c..c80dea7c69af 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -2,7 +2,6 @@ #include #include #include -#include #include "ctree.h" #include "extent_map.h" #include "compression.h" From 2afb9653bf56a32ac4955527357428a8c4fdd5b8 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 15 Feb 2018 11:36:07 +0800 Subject: [PATCH 038/164] btrfs: remove unused function btrfs_async_submit_limit() Commit [1] removed the need to use btrfs_async_submit_limit(), so delete it. 
[1] commit 736cd52e0c720103f52ab9da47b6cc3af6b083f6 Btrfs: remove nr_async_submits and async_submit_draining Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 -------- fs/btrfs/disk-io.h | 1 - 2 files changed, 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e33f316a86..b23961a4af52 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -710,14 +710,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, return 0; } -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) -{ - unsigned long limit = min_t(unsigned long, - info->thread_pool_size, - info->fs_devices->open_devices); - return 256 * limit; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 301151a50ac1..e2ac6a14150a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -133,7 +133,6 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, u64 bio_offset, void *private_data, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, From 3752d22fcea160cc2493e34f5e0e41cdd7fdd921 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 15 Feb 2018 12:29:38 +0800 Subject: [PATCH 039/164] btrfs: cow_file_range() num_bytes and disk_num_bytes are same This patch deletes local variable disk_num_bytes as its value is same as num_bytes in the function cow_file_range(). 
Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f53470112670..e90d7c597ef3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -961,7 +961,6 @@ static noinline int cow_file_range(struct inode *inode, u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; - u64 disk_num_bytes; u64 cur_alloc_size = 0; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; @@ -979,7 +978,6 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); @@ -1010,15 +1008,14 @@ static noinline int cow_file_range(struct inode *inode, } } - BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(fs_info->super_copy)); + BUG_ON(num_bytes > btrfs_super_total_bytes(fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); - while (disk_num_bytes > 0) { - cur_alloc_size = disk_num_bytes; + while (num_bytes > 0) { + cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); @@ -1082,11 +1079,10 @@ static noinline int cow_file_range(struct inode *inode, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); - if (disk_num_bytes < cur_alloc_size) - disk_num_bytes = 0; + if (num_bytes < cur_alloc_size) + num_bytes = 0; else - disk_num_bytes -= cur_alloc_size; - num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; extent_reserved = false; From 566b1760b42be7c99c02e894b1ec9a707c2e83d1 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 15 Feb 2018 18:07:59 +0800 Subject: [PATCH 040/164] btrfs: use ASSERT to report logical error in cow_file_range() Use ASSERT to report logical error in cow_file_range(), also move it a bit closer to when the num_bytes is derived. The extent start could be (u64)-1 in some cases, the assert should catch that we do not accidentally pass it to cow_file_range. 
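As background on why ASSERT is the right tool here, a rough paraphrase of its definition in fs/btrfs/ctree.h (simplified, not verbatim):

#ifdef CONFIG_BTRFS_ASSERT
#define ASSERT(expr) \
	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#else
#define ASSERT(expr)	((void)0)
#endif

The logical-error check is therefore active in debug builds but compiles away in production builds, unlike the unconditional BUG_ON it replaces.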
Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e90d7c597ef3..28491a82e645 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -978,6 +978,7 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); + ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); @@ -1008,8 +1009,6 @@ static noinline int cow_file_range(struct inode *inode, } } - BUG_ON(num_bytes > btrfs_super_total_bytes(fs_info->super_copy)); - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); From 4117f207d4b0d85ab78fec9cb138ed9c2a4b9f20 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 22 Jan 2018 13:50:54 +0800 Subject: [PATCH 041/164] btrfs: Add chunk allocation ENOSPC debug message for enospc_debug mount option The enospc_debug mount option makes the extent allocator print more debug messages; however, for chunk allocation there are no enospc_debug messages at all. This patch adds messages for the following parts of the chunk allocator: 1) No rw device at all: quite rare, but at least output one message for this case. 2) Not enough space on some device: this debug message is quite handy for unbalanced disks with stripe based profiles (RAID0/10/5/6). 3) Not enough free devices: this debug message should tell us if the current chunk allocator is working correctly under minimal device requirements. Although in most cases we will hit some other ENOSPC before we even hit a chunk allocator ENOSPC, in those cases the existing debug info won't help.
Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e4bfd81340a7..77c7a0c18190 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4700,8 +4700,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, BUG_ON(!alloc_profile_is_valid(type, 0)); - if (list_empty(&fs_devices->alloc_list)) + if (list_empty(&fs_devices->alloc_list)) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, "%s: no writable device", __func__); return -ENOSPC; + } index = __get_raid_index(type); @@ -4784,8 +4787,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret == 0) max_avail = max_stripe_size * dev_stripes; - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) + if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) { + if (btrfs_test_opt(info, ENOSPC_DEBUG)) + btrfs_debug(info, + "%s: devid %llu has no free space, have=%llu want=%u", + __func__, device->devid, max_avail, + BTRFS_STRIPE_LEN * dev_stripes); continue; + } if (ndevs == fs_devices->rw_devices) { WARN(1, "%s: found more than %llu devices\n", @@ -4810,6 +4819,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { ret = -ENOSPC; + if (btrfs_test_opt(info, ENOSPC_DEBUG)) { + btrfs_debug(info, + "%s: not enough devices with free space: have=%d minimum required=%d", + __func__, ndevs, min(devs_min, + devs_increment * sub_stripes)); + } goto error; } From 7ef2d6a7228f54f337a6fcb2e40de41bcc826ef2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 5 Jan 2018 10:47:07 +0800 Subject: [PATCH 042/164] btrfs: not a disk error if the bio_add_page fails bio_add_page() can fail for logical reasons as from the bio_add_page() comments: /* * This will only fail if either bio->bi_vcnt == bio->bi_max_vecs or * it's a cloned bio. */ Here we have just allocated the bio, so both of those failures can't occur. So drop the check. We can also drop the error stats for write error. 
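A rough paraphrase of the failure paths inside bio_add_page() (simplified, not the exact block layer source):

if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
	return 0;	/* a cloned bio must not modify its vec list */
if (bio->bi_vcnt >= bio->bi_max_vecs)
	return 0;	/* no room left for another bio_vec */

A freshly allocated, non-cloned bio with room for at least one vec satisfies neither condition, which is why checking the return value buys nothing here.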
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ec56f33feea9..beb441d0c5c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4620,7 +4620,6 @@ static int write_page_nocow(struct scrub_ctx *sctx, { struct bio *bio; struct btrfs_device *dev; - int ret; dev = sctx->wr_tgtdev; if (!dev) @@ -4635,17 +4634,15 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { -leave_with_eio: + /* bio_add_page won't fail on a freshly allocated bio */ + bio_add_page(bio, page, PAGE_SIZE, 0); + + if (btrfsic_submit_bio_wait(bio)) { bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; } - if (btrfsic_submit_bio_wait(bio)) - goto leave_with_eio; - bio_put(bio); return 0; } From f9cacae3145a07c8a2b699f18824df0cf7778431 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 9 Feb 2018 11:30:18 +0200 Subject: [PATCH 043/164] btrfs: Move error handling of btrfs_start_dirty_block_groups closer to call site Even though btrfs_start_dirty_block_groups is fairly in the beginning of btrfs_commit_transaction outside of the critical section defined by the transaction states it can only be run by a single comitter. In other words it defines its own critical section thanks to the BTRFS_TRANS_DIRTY_BG run flag and ro_block_group_mutex. However, its error handling is outside of this critical section which is a bit counter-intuitive. So move the error handling righ after the function is executed and let the sole runner of dirty block groups handle the return value. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ca2d91163af9..665438542b96 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2013,12 +2013,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) run_it = 1; mutex_unlock(&fs_info->ro_block_group_mutex); - if (run_it) + if (run_it) { ret = btrfs_start_dirty_block_groups(trans); - } - if (ret) { - btrfs_end_transaction(trans); - return ret; + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + } } spin_lock(&fs_info->trans_lock); From bc5511d0eda1c2534aa3cedb485eae1fc354f2dc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 23 Jan 2018 14:46:53 +0200 Subject: [PATCH 044/164] btrfs: Use schedule_timeout_interruptible Instead of manually fiddling with the state of the task (RUNNING->INTERRUPTIBLE->RUNNING) again just use schedule_timeout_interruptible which adjusts the task state as needed. No functional changes. 
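For context, a rough sketch of the helper (paraphrased from kernel/time/timer.c):

signed long schedule_timeout_interruptible(signed long timeout)
{
	__set_current_state(TASK_INTERRUPTIBLE);
	return schedule_timeout(timeout);
}

With the helper, the task state is only touched on the path that actually sleeps, so the explicit reset back to TASK_RUNNING in the old code is no longer needed.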
Signed-off-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b23961a4af52..fa80de56340f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1800,12 +1800,10 @@ sleep: if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))) btrfs_cleanup_transaction(fs_info); - set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || cannot_commit)) - schedule_timeout(delay); - __set_current_state(TASK_RUNNING); + schedule_timeout_interruptible(delay); } while (!kthread_should_stop()); return 0; } From 3d5addafd0c49d57afe867339ec1cc535d23715e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:55 -0700 Subject: [PATCH 045/164] Btrfs: do not check inode's runtime flags under root->orphan_lock It's not necessary to hold ->orphan_lock when checking inode's runtime flags. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 28491a82e645..4d3a4d1507a1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3355,14 +3355,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, return -ENOMEM; } - spin_lock(&root->orphan_lock); - if (!root->orphan_block_rsv) { - root->orphan_block_rsv = block_rsv; - } else if (block_rsv) { - btrfs_free_block_rsv(fs_info, block_rsv); - block_rsv = NULL; - } - if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags)) { #if 0 @@ -3377,12 +3369,23 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, insert = 1; #endif insert = 1; - atomic_inc(&root->orphan_inodes); } if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags)) reserve = 1; + + spin_lock(&root->orphan_lock); + /* If someone has created ->orphan_block_rsv, be happy to use it. */ + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(fs_info, block_rsv); + block_rsv = NULL; + } + + if (insert) + atomic_inc(&root->orphan_inodes); spin_unlock(&root->orphan_lock); /* grab metadata reservation from transaction handle */ From f8e10cd3f8dd3b15ea49f820a0d2decf3c654b13 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 22 Jan 2018 14:49:36 -0800 Subject: [PATCH 046/164] btrfs: keep device list sorted Maintaining the device list sorted lets us reproduce the problems related to missing chunks in degraded mode much more consistently. So fix this by sorting the devices by devid within the kernel, so that we know which device is assigned to struct fs_info::latest_bdev when all the devices have the same SB generation.
Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 77c7a0c18190..abc6555105bb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "ctree.h" #include "extent_map.h" @@ -1103,6 +1104,20 @@ out: return ret; } +static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct btrfs_device *dev1, *dev2; + + dev1 = list_entry(a, struct btrfs_device, dev_list); + dev2 = list_entry(b, struct btrfs_device, dev_list); + + if (dev1->devid < dev2->devid) + return -1; + else if (dev1->devid > dev2->devid) + return 1; + return 0; +} + int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { @@ -1113,6 +1128,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fs_devices->opened++; ret = 0; } else { + list_sort(NULL, &fs_devices->devices, devid_cmp); ret = __btrfs_open_devices(fs_devices, flags, holder); } mutex_unlock(&uuid_mutex); From b1b8e38622ea044ccdee0460253e428cb6e9038e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 22 Jan 2018 14:49:37 -0800 Subject: [PATCH 047/164] btrfs: insert newly opened device to the end of the list Add opened device to the tail of dev_alloc_list instead of head, so that it maintains the same order as dev_list. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index abc6555105bb..b2a2afe980f9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -709,7 +709,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { fs_devices->rw_devices++; - list_add(&device->dev_alloc_list, &fs_devices->alloc_list); + list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); } brelse(bh); From da07d4ab33e6207f77615acc4c17a19a4793980d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 12 Jan 2018 16:21:05 +0200 Subject: [PATCH 048/164] btrfs: Streamline btrfs_delalloc_reserve_metadata initial operations The behavior of btrfs_delalloc_reserve_metadata depends on whether the inode we are allocating for is the freespace inode or not. As it stands if we are the free node we set 'flush' and 'delalloc_lock' variable to certain values. Subsequently we check the values of those vars and act accordingly. Instead, simplify things by having 1 if which checks whether we are the freespace inode or not and do any specific operation in either branches of that if. This makes the code a bit easier to understand, as an added bonus it also shrinks the compiled size: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-17 (-17) Function old new delta btrfs_delalloc_reserve_metadata 1876 1859 -17 Total: Before=85966, After=85949, chg -0.02% No functional changes. 
Signed-off-by: Nikolay Borisov Reviewed-by: Edmund Nadolski Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 85258e85a030..b17506688914 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6061,13 +6061,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; delalloc_lock = false; - } else if (current->journal_info) { - flush = BTRFS_RESERVE_FLUSH_LIMIT; - } + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; - if (flush != BTRFS_RESERVE_NO_FLUSH && - btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + } if (delalloc_lock) mutex_lock(&inode->delalloc_mutex); From 793ff2c88c6397b3531c08cc4f920619b56a9def Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 31 Jan 2018 14:16:34 +0800 Subject: [PATCH 049/164] btrfs: volumes: Cleanup stripe size calculation Cleanup the following things: 1) open coded SZ_16M round up 2) use min() to replace open-coded size comparison 3) code style Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: Gu Jinxiang [ reformat comment ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b2a2afe980f9..c5d2954668fe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4874,18 +4874,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * and compare that answer with the max chunk size */ if (stripe_size * data_stripes > max_chunk_size) { - u64 mask = (1ULL << 24) - 1; - stripe_size = div_u64(max_chunk_size, data_stripes); /* bump the answer up to a 16MB boundary */ - stripe_size = (stripe_size + mask) & ~mask; + stripe_size = round_up(stripe_size, SZ_16M); - /* but don't go higher than the limits we found - * while searching for free extents + /* + * But don't go higher than the limits we found while searching + * for free extents */ - if (stripe_size > devices_info[ndevs-1].max_avail) - stripe_size = devices_info[ndevs-1].max_avail; + stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size); } /* align to BTRFS_STRIPE_LEN */ From c1c3fac2a95b14e36333528e408e76d90c93bbf7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 16 Jan 2018 09:31:58 +0200 Subject: [PATCH 050/164] btrfs: Remove btrfs_inode::delayed_iput_count delayed_iput_count wa supposed to be used to implement, well, delayed iput. The idea is that we keep accumulating the number of iputs we do until eventually the inode is deleted. Turns out we never really switched the delayed_iput_count from 0 to 1, hence all conditional code relying on the value of that member being different than 0 was never executed. This, as it turns out, didn't cause any problem due to the simple fact that the generic inode's i_count member was always used to count the number of iputs. So let's just remove the unused member and all unused code. This patch essentially provides no functional changes. 
While at it, also add proper documentation for btrfs_add_delayed_iput Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ reformat comment ] Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/inode.c | 27 +++++++++++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 63f0ccc92a71..f527e99c9f8d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -195,7 +195,6 @@ struct btrfs_inode { /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; - long delayed_iput_count; /* * To avoid races between lockless (i_mutex not held) direct IO writes diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4d3a4d1507a1..b55b47f493e9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3242,6 +3242,16 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } +/* + * btrfs_add_delayed_iput - perform a delayed iput on @inode + * + * @inode: The inode we want to perform iput on + * + * This function uses the generic vfs_inode::i_count to track whether we should + * just decrement it (in case it's > 1) or if this is the last iput then link + * the inode to the delayed iput machinery. Delayed iputs are processed at + * transaction commit time/superblock commit/cleaner kthread. + */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3251,12 +3261,8 @@ void btrfs_add_delayed_iput(struct inode *inode) return; spin_lock(&fs_info->delayed_iput_lock); - if (binode->delayed_iput_count == 0) { - ASSERT(list_empty(&binode->delayed_iput)); - list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); - } else { - binode->delayed_iput_count++; - } + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); } @@ -3269,13 +3275,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); - if (inode->delayed_iput_count) { - inode->delayed_iput_count--; - list_move_tail(&inode->delayed_iput, - &fs_info->delayed_iputs); - } else { - list_del_init(&inode->delayed_iput); - } + list_del_init(&inode->delayed_iput); spin_unlock(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); spin_lock(&fs_info->delayed_iput_lock); @@ -9333,7 +9333,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; - ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; From 019599ada73ce8ac0b0d60705819eba95df54630 Mon Sep 17 00:00:00 2001 From: Gu Jinxiang Date: Thu, 11 Jan 2018 16:12:17 +0800 Subject: [PATCH 051/164] btrfs: use reada direction enum instead of constant value in populate_free_space_tree populate_free_space_tree calls function btrfs_search_slot_for_read with parameter int find_higher = 1, it means that, if no exact match is found, then use the next higher item. So in function populate_free_space_tree, use READA_FORWARD to read forward ahead. This also changes the value from READA_BACK to READA_FORWARD, since according to the logic, it should reada_for_search forward, not backward. 
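As a reminder of why the old constant was misleading, the readahead direction is an enum in ctree.h, roughly:

enum { READA_NONE = 0, READA_BACK, READA_FORWARD };

so the hard-coded path->reada = 1 silently meant READA_BACK, while the search in populate_free_space_tree only ever moves forward.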
Signed-off-by: Gu JinXiang Reviewed-by: Nikolay Borisov [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index fe5e0324dca9..d7b6c0017143 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1071,7 +1071,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); if (!path2) { From 7ce311d552e473a5176f8bb143fb45de64b8bd81 Mon Sep 17 00:00:00 2001 From: Gu JinXiang Date: Thu, 11 Jan 2018 16:12:18 +0800 Subject: [PATCH 052/164] btrfs: use reada direction enum instead of constant value in load_free_space_tree load_free_space_tree calls either function load_free_space_bitmaps or load_free_space_extents. And either of those two will lead to call btrfs_next_item. So in function load_free_space_tree, use READA_FORWARD to read forward ahead. This also changes the value from READA_BACK to READA_FORWARD, since according to the logic, it should reada_for_search forward, not backward. Signed-off-by: Gu JinXiang Reviewed-by: Nikolay Borisov [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d7b6c0017143..af36a6a971fe 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1573,7 +1573,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; info = search_free_space_info(NULL, fs_info, block_group, path, 0); if (IS_ERR(info)) { From 5d23515be66904fa3b1b5d6bd72d2199cd2447ab Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 31 Jan 2018 10:52:04 +0200 Subject: [PATCH 053/164] btrfs: Move qgroup rescan on quota enable to btrfs_quota_enable Currently btrfs_run_qgroups is doing a bit too much. Not only is it responsible for synchronizing in-memory state of qgroups to disk but it also contains code to trigger the initial qgroup rescan when quota is enabled initially. This condition is detected by checking that BTRFS_FS_QUOTA_ENABLED is not set and BTRFS_FS_QUOTA_ENABLING is set. Nothing really requires from the code to be structured (and scattered) the way it is so let's streamline things. First move the quota rescan code into btrfs_quota_enable, where its invocation is closer to the use. This also makes the FS_QUOTA_ENABLING flag redundant so let's remove it as well. This has been tested with a full xfstest run with qgroups enabled on the scratch device of every xfstest and no regressions were observed. 
Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/qgroup.c | 35 ++++++++++------------------------- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 024d5feb5856..fa29ad826d7c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -707,7 +707,6 @@ struct btrfs_delayed_root; #define BTRFS_FS_LOG_RECOVERING 4 #define BTRFS_FS_OPEN 5 #define BTRFS_FS_QUOTA_ENABLED 6 -#define BTRFS_FS_QUOTA_ENABLING 7 #define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 #define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 #define BTRFS_FS_BTREE_ERR 11 diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aa259d6986e1..0fa4f07b80b8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -826,10 +826,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, int slot; mutex_lock(&fs_info->qgroup_ioctl_lock); - if (fs_info->quota_root) { - set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags); + if (fs_info->quota_root) goto out; - } fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); if (!fs_info->qgroup_ulist) { @@ -923,8 +921,15 @@ out_add_root: } spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags); + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); + if (!ret) { + qgroup_rescan_zero_tracking(fs_info); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); + } + out_free_path: btrfs_free_path(path); out_free_root: @@ -2080,17 +2085,9 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; - int start_rescan_worker = 0; if (!quota_root) - goto out; - - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - test_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) - start_rescan_worker = 1; - - if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + return ret; spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { @@ -2119,18 +2116,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - if (!ret && start_rescan_worker) { - ret = qgroup_rescan_init(fs_info, 0, 1); - if (!ret) { - qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_work(fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); - } - ret = 0; - } - -out: - return ret; } From 393da91819e35af538ef97c7c6a04899e2fbfe0e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 5 Jan 2018 12:51:16 -0700 Subject: [PATCH 054/164] Btrfs: add tracepoint for em's EEXIST case This is adding a tracepoint 'btrfs_handle_em_exist' to help debug the subtle bugs around merge_extent_mapping. Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 3 +++ include/trace/events/btrfs.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index c80dea7c69af..b8ead8dc2ebe 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -551,6 +551,9 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree, ret = 0; existing = search_extent_mapping(em_tree, start, len); + + trace_btrfs_handle_em_exist(existing, em, start, len); + /* * existing will always be non-NULL, since there must be * extent causing the -EEXIST. 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index c3ac5ec86519..486771e3f4cb 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -248,6 +248,41 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->refs, __entry->compress_type) ); +TRACE_EVENT(btrfs_handle_em_exist, + + TP_PROTO(const struct extent_map *existing, const struct extent_map *map, u64 start, u64 len), + + TP_ARGS(existing, map, start, len), + + TP_STRUCT__entry( + __field( u64, e_start ) + __field( u64, e_len ) + __field( u64, map_start ) + __field( u64, map_len ) + __field( u64, start ) + __field( u64, len ) + ), + + TP_fast_assign( + __entry->e_start = existing->start; + __entry->e_len = existing->len; + __entry->map_start = map->start; + __entry->map_len = map->len; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("start=%llu len=%llu " + "existing(start=%llu len=%llu) " + "em(start=%llu len=%llu)", + (unsigned long long)__entry->start, + (unsigned long long)__entry->len, + (unsigned long long)__entry->e_start, + (unsigned long long)__entry->e_len, + (unsigned long long)__entry->map_start, + (unsigned long long)__entry->map_len) +); + /* file extent item */ DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, From 2f659546c9048931c2b8e146824a892b74a8e33c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 25 Jan 2018 14:56:18 +0800 Subject: [PATCH 055/164] btrfs: tree-checker: Replace root parameter with fs_info When inspecting the error message with real corruption, the "root=%llu" always shows "1" (root tree), instead of the correct owner. The problem is that we are getting @root from page->mapping->host, which points the same btree inode, so we will always get the same root. This makes the root owner output meaningless, and harder to port tree-checker to btrfs-progs. So get rid of the false and meaningless @root parameter and replace it with @fs_info. To get the owner, we can only rely on btrfs_header_owner() now. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +- fs/btrfs/tree-checker.c | 147 ++++++++++++++++++++-------------------- fs/btrfs/tree-checker.h | 7 +- 3 files changed, 82 insertions(+), 78 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fa80de56340f..9d2c932b012e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -602,12 +602,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && btrfs_check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(fs_info, eb)) ret = -EIO; if (!ret) @@ -3854,7 +3854,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) * So here we should only check item pointers, not item data. */ if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(root, buf)) { + btrfs_check_leaf_relaxed(fs_info, buf)) { btrfs_print_leaf(buf); ASSERT(0); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c3c8d48f6618..a5244f98f3b4 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -53,7 +53,7 @@ * Allows callers to customize the output. 
*/ __printf(4, 5) -static void generic_err(const struct btrfs_root *root, +static void generic_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { @@ -65,10 +65,10 @@ static void generic_err(const struct btrfs_root *root, vaf.fmt = fmt; vaf.va = &args; - btrfs_crit(root->fs_info, + btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d, %pV", btrfs_header_level(eb) == 0 ? "leaf" : "node", - root->objectid, btrfs_header_bytenr(eb), slot, &vaf); + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf); va_end(args); } @@ -77,7 +77,7 @@ static void generic_err(const struct btrfs_root *root, * offset has its own meaning. */ __printf(4, 5) -static void file_extent_err(const struct btrfs_root *root, +static void file_extent_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { @@ -91,10 +91,11 @@ static void file_extent_err(const struct btrfs_root *root, vaf.fmt = fmt; vaf.va = &args; - btrfs_crit(root->fs_info, + btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", - btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, - btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, key.offset, &vaf); va_end(args); } @@ -102,26 +103,26 @@ static void file_extent_err(const struct btrfs_root *root, * Return 0 if the btrfs_file_extent_##name is aligned to @alignment * Else return 1 */ -#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \ +#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \ ({ \ if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ - file_extent_err((root), (leaf), (slot), \ + file_extent_err((fs_info), (leaf), (slot), \ "invalid %s for file extent, have %llu, should be aligned to %u", \ (#name), btrfs_file_extent_##name((leaf), (fi)), \ (alignment)); \ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ }) -static int check_extent_data_item(struct btrfs_root *root, +static int check_extent_data_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_file_extent_item *fi; - u32 sectorsize = root->fs_info->sectorsize; + u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); if (!IS_ALIGNED(key->offset, sectorsize)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "unaligned file_offset for file extent, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; @@ -130,7 +131,7 @@ static int check_extent_data_item(struct btrfs_root *root, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid type for file extent, have %u expect range [0, %u]", btrfs_file_extent_type(leaf, fi), BTRFS_FILE_EXTENT_TYPES); @@ -142,14 +143,14 @@ static int check_extent_data_item(struct btrfs_root *root, * and must be caught in open_ctree(). 
*/ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", btrfs_file_extent_compression(leaf, fi), BTRFS_COMPRESS_TYPES); return -EUCLEAN; } if (btrfs_file_extent_encryption(leaf, fi)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid encryption for file extent, have %u expect 0", btrfs_file_extent_encryption(leaf, fi)); return -EUCLEAN; @@ -157,7 +158,7 @@ static int check_extent_data_item(struct btrfs_root *root, if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { /* Inline extent must have 0 as key offset */ if (key->offset) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid file_offset for inline file extent, have %llu expect 0", key->offset); return -EUCLEAN; @@ -171,7 +172,7 @@ static int check_extent_data_item(struct btrfs_root *root, /* Uncompressed inline extent size must match item size */ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + btrfs_file_extent_ram_bytes(leaf, fi)); @@ -182,40 +183,41 @@ static int check_extent_data_item(struct btrfs_root *root, /* Regular or preallocated extent has fixed item size */ if (item_size != sizeof(*fi)) { - file_extent_err(root, leaf, slot, + file_extent_err(fs_info, leaf, slot, "invalid item size for reg/prealloc file extent, have %u expect %zu", item_size, sizeof(*fi)); return -EUCLEAN; } - if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) || - CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize)) + if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; return 0; } -static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, - struct btrfs_key *key, int slot) +static int check_csum_item(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, struct btrfs_key *key, + int slot) { - u32 sectorsize = root->fs_info->sectorsize; - u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + u32 sectorsize = fs_info->sectorsize; + u32 csumsize = btrfs_super_csum_size(fs_info->super_copy); if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "invalid key objectid for csum item, have %llu expect %llu", key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); return -EUCLEAN; } if (!IS_ALIGNED(key->offset, sectorsize)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "unaligned key offset for csum item, have %llu should be aligned to %u", key->offset, sectorsize); return -EUCLEAN; } if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "unaligned item size for 
csum item, have %u should be aligned to %u", btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; @@ -228,7 +230,7 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, * which represents inode number */ __printf(4, 5) -static void dir_item_err(const struct btrfs_root *root, +static void dir_item_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) { @@ -242,14 +244,15 @@ static void dir_item_err(const struct btrfs_root *root, vaf.fmt = fmt; vaf.va = &args; - btrfs_crit(root->fs_info, + btrfs_crit(fs_info, "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", - btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, - btrfs_header_bytenr(eb), slot, key.objectid, &vaf); + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); va_end(args); } -static int check_dir_item(struct btrfs_root *root, +static int check_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { @@ -268,7 +271,7 @@ static int check_dir_item(struct btrfs_root *root, /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item header crosses item boundary, have %zu boundary %u", cur + sizeof(*di), item_size); return -EUCLEAN; @@ -277,7 +280,7 @@ static int check_dir_item(struct btrfs_root *root, /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "invalid dir item type, have %u expect [0, %u)", dir_type, BTRFS_FT_MAX); return -EUCLEAN; @@ -285,14 +288,14 @@ static int check_dir_item(struct btrfs_root *root, if (key->type == BTRFS_XATTR_ITEM_KEY && dir_type != BTRFS_FT_XATTR) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "invalid dir item type for XATTR key, have %u expect %u", dir_type, BTRFS_FT_XATTR); return -EUCLEAN; } if (dir_type == BTRFS_FT_XATTR && key->type != BTRFS_XATTR_ITEM_KEY) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "xattr dir type found for non-XATTR key"); return -EUCLEAN; } @@ -305,21 +308,21 @@ static int check_dir_item(struct btrfs_root *root, name_len = btrfs_dir_name_len(leaf, di); data_len = btrfs_dir_data_len(leaf, di); if (name_len > max_name_len) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item name len too long, have %u max %u", name_len, max_name_len); return -EUCLEAN; } - if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) { - dir_item_err(root, leaf, slot, + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) { + dir_item_err(fs_info, leaf, slot, "dir item name and data len too long, have %u max %u", name_len + data_len, - BTRFS_MAX_XATTR_SIZE(root->fs_info)); + BTRFS_MAX_XATTR_SIZE(fs_info)); return -EUCLEAN; } if (data_len && dir_type != BTRFS_FT_XATTR) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item with invalid data len, have %u expect 0", data_len); return -EUCLEAN; @@ -329,7 +332,7 @@ static int check_dir_item(struct btrfs_root *root, /* header and name/data should not cross item boundary */ if (cur + total_size > item_size) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "dir item data crosses item boundary, have %u boundary %u", cur + total_size, item_size); return -EUCLEAN; @@ -347,7 +350,7 @@ static int 
check_dir_item(struct btrfs_root *root, (unsigned long)(di + 1), name_len); name_hash = btrfs_name_hash(namebuf, name_len); if (key->offset != name_hash) { - dir_item_err(root, leaf, slot, + dir_item_err(fs_info, leaf, slot, "name hash mismatch with key, have 0x%016x expect 0x%016llx", name_hash, key->offset); return -EUCLEAN; @@ -362,7 +365,7 @@ static int check_dir_item(struct btrfs_root *root, /* * Common point to switch the item-specific validation. */ -static int check_leaf_item(struct btrfs_root *root, +static int check_leaf_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_key *key, int slot) { @@ -370,24 +373,23 @@ static int check_leaf_item(struct btrfs_root *root, switch (key->type) { case BTRFS_EXTENT_DATA_KEY: - ret = check_extent_data_item(root, leaf, key, slot); + ret = check_extent_data_item(fs_info, leaf, key, slot); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(root, leaf, key, slot); + ret = check_csum_item(fs_info, leaf, key, slot); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: case BTRFS_XATTR_ITEM_KEY: - ret = check_dir_item(root, leaf, key, slot); + ret = check_dir_item(fs_info, leaf, key, slot); break; } return ret; } -static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, +static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, bool check_item_data) { - struct btrfs_fs_info *fs_info = root->fs_info; /* No valid key type is 0, so all key should be larger than this key */ struct btrfs_key prev_key = {0, 0, 0}; struct btrfs_key key; @@ -420,7 +422,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, eb = btrfs_root_node(check_root); /* if leaf is the root, then it's fine */ if (leaf != eb) { - generic_err(check_root, leaf, 0, + generic_err(fs_info, leaf, 0, "invalid nritems, have %u should not be 0 for non-root leaf", nritems); free_extent_buffer(eb); @@ -453,7 +455,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, /* Make sure the keys are in the right order */ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, @@ -472,7 +474,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, item_end_expected = btrfs_item_offset_nr(leaf, slot - 1); if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "unexpected item end, have %u expect %u", btrfs_item_end_nr(leaf, slot), item_end_expected); @@ -486,7 +488,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, */ if (btrfs_item_end_nr(leaf, slot) > BTRFS_LEAF_DATA_SIZE(fs_info)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "slot end outside of leaf, have %u expect range [0, %u]", btrfs_item_end_nr(leaf, slot), BTRFS_LEAF_DATA_SIZE(fs_info)); @@ -496,7 +498,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, /* Also check if the item pointer overlaps with btrfs item. 
*/ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > btrfs_item_ptr_offset(leaf, slot)) { - generic_err(root, leaf, slot, + generic_err(fs_info, leaf, slot, "slot overlaps with its data, item end %lu data start %lu", btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item), @@ -509,7 +511,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, * Check if the item size and content meet other * criteria */ - ret = check_leaf_item(root, leaf, &key, slot); + ret = check_leaf_item(fs_info, leaf, &key, slot); if (ret < 0) return ret; } @@ -522,18 +524,19 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, return 0; } -int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf) +int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf) { - return check_leaf(root, leaf, true); + return check_leaf(fs_info, leaf, true); } -int btrfs_check_leaf_relaxed(struct btrfs_root *root, +int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf) { - return check_leaf(root, leaf, false); + return check_leaf(fs_info, leaf, false); } -int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) +int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node) { unsigned long nr = btrfs_header_nritems(node); struct btrfs_key key, next_key; @@ -541,12 +544,12 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) u64 bytenr; int ret = 0; - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { - btrfs_crit(root->fs_info, + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) { + btrfs_crit(fs_info, "corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", - root->objectid, node->start, + btrfs_header_owner(node), node->start, nr == 0 ? "small" : "large", nr, - BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); return -EUCLEAN; } @@ -556,21 +559,21 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) btrfs_node_key_to_cpu(node, &next_key, slot + 1); if (!bytenr) { - generic_err(root, node, slot, + generic_err(fs_info, node, slot, "invalid NULL node pointer"); ret = -EUCLEAN; goto out; } - if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { - generic_err(root, node, slot, + if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) { + generic_err(fs_info, node, slot, "unaligned pointer, have %llu should be aligned to %u", - bytenr, root->fs_info->sectorsize); + bytenr, fs_info->sectorsize); ret = -EUCLEAN; goto out; } if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - generic_err(root, node, slot, + generic_err(fs_info, node, slot, "bad key order, current (%llu %u %llu) next (%llu %u %llu)", key.objectid, key.type, key.offset, next_key.objectid, next_key.type, diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3d53e8d6fda0..aba542755710 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -25,14 +25,15 @@ * Will check not only the item pointers, but also every possible member * in item data. */ -int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf); /* * Less strict leaf checker. * Will only check item pointers, not reading item data. 
*/ -int btrfs_check_leaf_relaxed(struct btrfs_root *root, +int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf); -int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); +int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node); #endif From 3e72ee8874f42ddbe72a090044f7c03740158183 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 30 Jan 2018 18:20:45 +0800 Subject: [PATCH 056/164] btrfs: Refactor __get_raid_index() to btrfs_bg_flags_to_raid_index() Function __get_raid_index() is used to convert block group flags into raid index, which can be used to get various info directly from btrfs_raid_array[]. Refactor this function a little: 1) Rename to btrfs_bg_flags_to_raid_index() Double underline prefix is normally for internal functions, while the function is used by both extent-tree and volumes. Although the name is a little longer, but it should explain its usage quite well. 2) Move it to volumes.h and make it static inline Just several if-else branches, really no need to define it as a normal function. This also makes later code re-use between kernel and btrfs-progs easier. 3) Remove function get_block_group_index() Really no need to do such a simple thing as an exported function. Signed-off-by: Qu Wenruo Reviewed-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/extent-tree.c | 41 ++++++++++------------------------------- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 22 ++++++++++++++++++++++ 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fa29ad826d7c..a4877b6959e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2627,7 +2627,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( u64 bytenr); void btrfs_get_block_group(struct btrfs_block_group_cache *cache); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); -int get_block_group_index(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2787,7 +2786,6 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b17506688914..22ac82198a54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7345,29 +7345,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return ret; } -int __get_raid_index(u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ -} - -int get_block_group_index(struct btrfs_block_group_cache *cache) -{ - return __get_raid_index(cache->flags); 
-} - static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = "raid10", [BTRFS_RAID_RAID1] = "raid1", @@ -7482,7 +7459,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; - int index = __get_raid_index(flags); + int index = btrfs_bg_flags_to_raid_index(flags); bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7568,7 +7545,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index( + block_group->flags); btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } @@ -7578,7 +7556,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, } search: have_caching_bg = false; - if (index == 0 || index == __get_raid_index(flags)) + if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], @@ -7836,7 +7814,8 @@ checks: loop: failed_cluster_refill = false; failed_alloc = false; - BUG_ON(index != get_block_group_index(block_group)); + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + index); btrfs_release_block_group(block_group, delalloc); cond_resched(); } @@ -9642,7 +9621,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) */ target = get_restripe_target(fs_info, block_group->flags); if (target) { - index = __get_raid_index(extended_to_chunk(target)); + index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); } else { /* * this is just a balance, so if we were marked as full @@ -9656,7 +9635,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) goto out; } - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index(block_group->flags); } if (index == BTRFS_RAID_RAID10) { @@ -9908,7 +9887,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) static void link_block_group(struct btrfs_block_group_cache *cache) { struct btrfs_space_info *space_info = cache->space_info; - int index = get_block_group_index(cache); + int index = btrfs_bg_flags_to_raid_index(cache->flags); bool first = false; down_write(&space_info->groups_sem); @@ -10328,7 +10307,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index(block_group->flags); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c5d2954668fe..2c0c08ec987a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4722,7 +4722,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, return -ENOSPC; } - index = __get_raid_index(type); + index = btrfs_bg_flags_to_raid_index(type); sub_stripes = btrfs_raid_array[index].sub_stripes; dev_stripes = btrfs_raid_array[index].dev_stripes; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 654df38faa7a..d110fb03ec0d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -544,6 +544,28 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, btrfs_dev_stat_set(dev, index, 0); } +/* + * Convert block group flags 
(BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which + * can be used as index to access btrfs_raid_array[]. + */ +static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID10) + return BTRFS_RAID_RAID10; + else if (flags & BTRFS_BLOCK_GROUP_RAID1) + return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_DUP) + return BTRFS_RAID_DUP; + else if (flags & BTRFS_BLOCK_GROUP_RAID0) + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; + + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans); From df91f56adce1fc131e05368a0ad0ea72afd9a79a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Jan 2018 11:45:04 +0200 Subject: [PATCH 057/164] libcrc32c: Add crc32c_impl function This function returns a string with the currently in-use implementation of the crc32c algorithm, i.e crc32c-generic (for unoptimised, generic implementation) or crc32c-intel for the sse optimised version. This will be used by btrfs. Signed-off-by: Nikolay Borisov Acked-by: Herbert Xu [ use crypto_shash_driver_name as suggested by Herbert ] Signed-off-by: David Sterba --- include/linux/crc32c.h | 1 + lib/libcrc32c.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index 357ae4611a45..bd21af828ff6 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -5,6 +5,7 @@ #include extern u32 crc32c(u32 crc, const void *address, unsigned int length); +extern const char *crc32c_impl(void); /* This macro exists for backwards-compatibility. */ #define crc32c_le crc32c diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 9f79547d1b97..f0a2934605bf 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -71,6 +71,12 @@ static void __exit libcrc32c_mod_fini(void) crypto_free_shash(tfm); } +const char *crc32c_impl(void) +{ + return crypto_shash_driver_name(tfm); +} +EXPORT_SYMBOL(crc32c_impl); + module_init(libcrc32c_mod_init); module_exit(libcrc32c_mod_fini); From 9678c54388b6a6b309ff7ee5c8d23fa9eba7c06f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Jan 2018 11:45:05 +0200 Subject: [PATCH 058/164] btrfs: Remove custom crc32c init code The custom crc32 init code was introduced in 14a958e678cd ("Btrfs: fix btrfs boot when compiled as built-in") to enable using btrfs as a built-in. However, later as pointed out by 60efa5eb2e88 ("Btrfs: use late_initcall instead of module_init") this wasn't enough and finally btrfs was switched to late_initcall which comes after the generic crc32c implementation is initiliased. The latter commit superseeded the former. Now that we don't have to maintain our own code let's just remove it and switch to using the generic implementation. Despite touching a lot of files the patch is really simple. Here is the gist of the changes: 1. Select LIBCRC32C rather than the low-level modules. 2. s/btrfs_crc32c/crc32c/g 3. replace hash.h with linux/crc32c.h 4. Move the btrfs namehash funcs to ctree.h and change the tree accordingly. I've tested this with btrfs being both a module and a built-in and xfstest doesn't complain. Does seem to fix the longstanding problem of not automatically selectiong the crc32c module when btrfs is used. Possibly there is a workaround in dracut. 
The modinfo confirms that now all the module dependencies are there: before: depends: zstd_compress,zstd_decompress,raid6_pq,xor,zlib_deflate after: depends: libcrc32c,zstd_compress,zstd_decompress,raid6_pq,xor,zlib_deflate Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ add more info to changelog from mails ] Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 3 +-- fs/btrfs/Makefile | 2 +- fs/btrfs/check-integrity.c | 4 +-- fs/btrfs/ctree.h | 16 +++++++++++ fs/btrfs/dir-item.c | 1 - fs/btrfs/disk-io.c | 4 +-- fs/btrfs/extent-tree.c | 10 +++---- fs/btrfs/hash.c | 54 -------------------------------------- fs/btrfs/hash.h | 43 ------------------------------ fs/btrfs/inode-item.c | 1 - fs/btrfs/inode.c | 1 - fs/btrfs/props.c | 2 +- fs/btrfs/send.c | 4 +-- fs/btrfs/super.c | 14 +++------- fs/btrfs/tree-checker.c | 1 - fs/btrfs/tree-log.c | 2 +- 16 files changed, 35 insertions(+), 127 deletions(-) delete mode 100644 fs/btrfs/hash.c delete mode 100644 fs/btrfs/hash.h diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 273351ee4c46..167e5dc7eadd 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,7 +1,6 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select CRYPTO - select CRYPTO_CRC32C + select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 0c4373628eb4..ca693dd554e9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o + uuid-tree.o props.o free-space-tree.o tree-checker.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7d51b5a5b505..3baebbc021c5 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -96,9 +96,9 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "extent_io.h" #include "volumes.h" @@ -1736,7 +1736,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, size_t sublen = i ? 
PAGE_SIZE : (PAGE_SIZE - BTRFS_CSUM_SIZE); - crc = btrfs_crc32c(crc, data, sublen); + crc = crc32c(crc, data, sublen); } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a4877b6959e3..92b9db7186bb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -40,6 +40,7 @@ #include #include #include +#include #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -98,6 +99,7 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M + /* * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size */ @@ -2553,6 +2555,20 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} + +/* + * Figure the key offset of an extended inode ref + */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, + int len) +{ + return (u64) crc32c(parent_objectid, name, len); +} + static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index cbe421605cd5..29e967b2c667 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -18,7 +18,6 @@ #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9d2c932b012e..c10c84640eee 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -31,10 +31,10 @@ #include #include #include +#include #include #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" @@ -270,7 +270,7 @@ out: u32 btrfs_csum_data(const char *data, u32 seed, size_t len) { - return btrfs_crc32c(seed, data, len); + return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, u8 *result) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 22ac82198a54..2760292e1175 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -27,7 +27,7 @@ #include #include #include -#include "hash.h" +#include #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -1203,11 +1203,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -5944,7 +5944,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, */ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), + trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), num_bytes, 1); return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c deleted file mode 100644 index baacc1866861..000000000000 --- a/fs/btrfs/hash.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2014 Filipe David Borba Manana - * - * This program is free software; you can 
redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#include -#include -#include "hash.h" - -static struct crypto_shash *tfm; - -int __init btrfs_hash_init(void) -{ - tfm = crypto_alloc_shash("crc32c", 0, 0); - - return PTR_ERR_OR_ZERO(tfm); -} - -const char* btrfs_crc32c_impl(void) -{ - return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); -} - -void btrfs_hash_exit(void) -{ - crypto_free_shash(tfm); -} - -u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) -{ - SHASH_DESC_ON_STACK(shash, tfm); - u32 *ctx = (u32 *)shash_desc_ctx(shash); - u32 retval; - int err; - - shash->tfm = tfm; - shash->flags = 0; - *ctx = crc; - - err = crypto_shash_update(shash, address, length); - BUG_ON(err); - - retval = *ctx; - barrier_data(ctx); - return retval; -} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h deleted file mode 100644 index c3a2ec554361..000000000000 --- a/fs/btrfs/hash.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __HASH__ -#define __HASH__ - -int __init btrfs_hash_init(void); - -void btrfs_hash_exit(void); -const char* btrfs_crc32c_impl(void); - -u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); - -static inline u64 btrfs_name_hash(const char *name, int len) -{ - return btrfs_crc32c((u32)~1, name, len); -} - -/* - * Figure the key offset of an extended inode ref - */ -static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, - int len) -{ - return (u64) btrfs_crc32c(parent_objectid, name, len); -} - -#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 65e1a76bf755..1d5631ef2738 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -18,7 +18,6 @@ #include "ctree.h" #include "disk-io.h" -#include "hash.h" #include "transaction.h" #include "print-tree.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b55b47f493e9..6504e63b2317 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,7 +58,6 @@ #include "free-space-cache.h" #include "inode-map.h" #include "backref.h" -#include "hash.h" #include "props.h" #include "qgroup.h" #include "dedupe.h" diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b30a056963ab..e4ac24175524 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -19,8 +19,8 @@ #include #include "props.h" #include "btrfs_inode.h" -#include "hash.h" #include "transaction.h" +#include "ctree.h" #include "xattr.h" #include "compression.h" diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 484e2af793de..b0c5d710183e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -27,10 +27,10 @@ #include #include #include +#include #include "send.h" #include "backref.h" -#include "hash.h" #include "locking.h" #include "disk-io.h" #include "btrfs_inode.h" @@ -695,7 +695,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 540c18511e7a..5d752f791950 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "delayed-inode.h" #include "ctree.h" @@ -48,7 +49,6 @@ #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" -#include "hash.h" #include "props.h" #include "xattr.h" #include "volumes.h" @@ -2357,22 +2357,18 @@ static void __init btrfs_print_mod_info(void) ", ref-verify=on" #endif "\n", - btrfs_crc32c_impl()); + crc32c_impl()); } static int __init init_btrfs_fs(void) { int err; - err = btrfs_hash_init(); - if (err) - return err; - btrfs_props_init(); err = btrfs_init_sysfs(); if (err) - goto free_hash; + return err; btrfs_init_compress(); @@ -2453,8 +2449,7 @@ free_cachep: free_compress: btrfs_exit_compress(); btrfs_exit_sysfs(); -free_hash: - btrfs_hash_exit(); + return err; } @@ -2474,7 +2469,6 @@ static void __exit exit_btrfs_fs(void) btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); btrfs_exit_compress(); - btrfs_hash_exit(); } late_initcall(init_btrfs_fs); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a5244f98f3b4..e96cfd93ae3f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -30,7 +30,6 @@ #include "tree-checker.h" #include "disk-io.h" #include "compression.h" -#include "hash.h" /* * Error message should follow the 
following format: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fac5fd1cc786..bbd8a40b2006 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -21,12 +21,12 @@ #include #include #include +#include "ctree.h" #include "tree-log.h" #include "disk-io.h" #include "locking.h" #include "print-tree.h" #include "backref.h" -#include "hash.h" #include "compression.h" #include "qgroup.h" #include "inode-map.h" From 5811375325420052fcadd944792a416a43072b7f Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 31 Jan 2018 17:09:13 -0700 Subject: [PATCH 059/164] Btrfs: fix unexpected cow in run_delalloc_nocow Fstests generic/475 provides a way to fail metadata reads while checking if checksum exists for the inode inside run_delalloc_nocow(), and csum_exist_in_range() interprets error (-EIO) as inode having checksum and makes its caller enter the cow path. In case of free space inode, this ends up with a warning in cow_file_range(). The same problem applies to btrfs_cross_ref_exist() since it may also read metadata in between. With this, run_delalloc_nocow() bails out when errors occur at the two places. cc: v2.6.28+ Fixes: 17d217fe970d ("Btrfs: fix nodatasum handling in balancing code") Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6504e63b2317..491a7397f6fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1256,6 +1256,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, list_del(&sums->list); kfree(sums); } + if (ret < 0) + return ret; return 1; } @@ -1388,10 +1390,23 @@ next_slot: goto out_check; if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out_check; - if (btrfs_cross_ref_exist(root, ino, - found_key.offset - - extent_offset, disk_bytenr)) + ret = btrfs_cross_ref_exist(root, ino, + found_key.offset - + extent_offset, disk_bytenr); + if (ret) { + /* + * ret could be -EIO if the above fails to read + * metadata. + */ + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } + + WARN_ON_ONCE(nolock); goto out_check; + } disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; @@ -1409,10 +1424,22 @@ next_slot: * this ensure that csum for a given extent are * either valid or do not exist. */ - if (csum_exist_in_range(fs_info, disk_bytenr, - num_bytes)) { + ret = csum_exist_in_range(fs_info, disk_bytenr, + num_bytes); + if (ret) { if (!nolock) btrfs_end_write_no_snapshotting(root); + + /* + * ret could be -EIO if the above fails to read + * metadata. + */ + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } + WARN_ON_ONCE(nolock); goto out_check; } if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { From ccb0e7d1c1bc222966085550e27957c17138b629 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 15 Feb 2018 01:11:37 +0800 Subject: [PATCH 060/164] btrfs: verify subvolid mount parameter We aren't verifying the parameter passed to the subvolid mount option, so we won't report and fail the mount if a junk value is specified for example, -o subvolid=abc. This patch verifies the subvolid option with match_u64. Up to now the memparse function accepts the K/M/G/ suffixes, that are usually meant for size values and do not make sense for a subvolume it. 
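To illustrate the difference between the two parsers, a minimal sketch (hypothetical function name, not part of the patch), assuming the usual helpers from linux/parser.h and linux/kernel.h:

  /* Illustrative sketch only, not part of the patch. */
  #include <linux/kernel.h>   /* memparse() */
  #include <linux/parser.h>   /* substring_t, match_u64() */

  static int subvolid_parse_example(void)
  {
          char junk[] = "abc";
          substring_t arg = { .from = junk, .to = junk + 3 };
          u64 subvolid;

          /* Old behaviour: memparse() cannot report failure, so junk
           * input silently parses as 0, and a size suffix such as "5G"
           * gets multiplied up, neither of which is a valid id. */
          subvolid = memparse(junk, NULL);        /* yields 0 */

          /* New behaviour: only a plain number is accepted, anything
           * else returns an error and the mount is refused. */
          return match_u64(&arg, &subvolid);      /* fails for "abc" */
  }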
Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/super.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5d752f791950..07bc2bfbdb96 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -914,8 +914,8 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags, { substring_t args[MAX_OPT_ARGS]; char *opts, *orig, *p; - char *num = NULL; int error = 0; + u64 subvolid; if (!options) return 0; @@ -945,18 +945,15 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags, } break; case Opt_subvolid: - num = match_strdup(&args[0]); - if (num) { - *subvol_objectid = memparse(num, NULL); - kfree(num); - /* we want the original fs_tree */ - if (!*subvol_objectid) - *subvol_objectid = - BTRFS_FS_TREE_OBJECTID; - } else { - error = -EINVAL; + error = match_u64(&args[0], &subvolid); + if (error) goto out; - } + + /* we want the original fs_tree */ + if (subvolid == 0) + subvolid = BTRFS_FS_TREE_OBJECTID; + + *subvol_objectid = subvolid; break; case Opt_subvolrootid: pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); From ffc5a3794f9779c1a09e18e6d75bb6cc22b37523 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 19 Feb 2018 17:24:15 +0100 Subject: [PATCH 061/164] btrfs: add (the only possible) __exit annotation Recently, the __init annotations have been added. There's unfortunatelly only one case where we can add __exit, because most of the cleanup helpers are also called from the __init phase. As the __exit annotated functions get discarded completely for a built-in code, we'd miss them from the init phase. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2c0c08ec987a..797e7706e67b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -279,7 +279,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev, &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_cleanup_fs_uuids(void) +void __exit btrfs_cleanup_fs_uuids(void) { struct btrfs_fs_devices *fs_devices; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d110fb03ec0d..d28f5745fee2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -436,7 +436,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u8 *uuid); int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 devid); -void btrfs_cleanup_fs_uuids(void); +void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); From e67c718b5b9a306bde7e966be7b4ca48fa063d73 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 19 Feb 2018 17:24:18 +0100 Subject: [PATCH 062/164] btrfs: add more __cold annotations The __cold functions are placed to a special section, as they're expected to be called rarely. This could help i-cache prefetches or help compiler to decide which branches are more/less likely to be taken without any other annotations needed. Though we can't add more __exit annotations, it's still possible to add __cold (that's also added with __exit). 
That way the following function categories are tagged: - printf wrappers, error messages - exit helpers Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 +- fs/btrfs/backref.h | 2 +- fs/btrfs/compression.c | 2 +- fs/btrfs/compression.h | 2 +- fs/btrfs/ctree.h | 9 +++++---- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/delayed-inode.h | 2 +- fs/btrfs/delayed-ref.c | 2 +- fs/btrfs/delayed-ref.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/extent_map.c | 2 +- fs/btrfs/extent_map.h | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/send.c | 1 + fs/btrfs/super.c | 2 +- fs/btrfs/sysfs.c | 2 +- fs/btrfs/tree-checker.c | 3 +++ 23 files changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 26484648d090..4a33448cbb01 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -170,7 +170,7 @@ int __init btrfs_prelim_ref_init(void) return 0; } -void btrfs_prelim_ref_exit(void) +void __cold btrfs_prelim_ref_exit(void) { kmem_cache_destroy(btrfs_prelim_ref_cache); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 0c2fab8514ff..0a30028d5196 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -73,7 +73,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); int __init btrfs_prelim_ref_init(void); -void btrfs_prelim_ref_exit(void); +void __cold btrfs_prelim_ref_exit(void); struct prelim_ref { struct rb_node rbnode; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 07d049c0c20f..562c3e633403 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1133,7 +1133,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void btrfs_exit_compress(void) +void __cold btrfs_exit_compress(void) { free_workspaces(); } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 677fa4aa0bd7..ce796557a918 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -76,7 +76,7 @@ struct compressed_bio { }; void __init btrfs_init_compress(void); -void btrfs_exit_compress(void); +void __cold btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 92b9db7186bb..d6a2fc311187 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3204,7 +3204,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); -void btrfs_destroy_cachep(void); +void __cold btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); @@ -3255,7 +3255,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, /* file.c */ int __init btrfs_auto_defrag_init(void); -void btrfs_auto_defrag_exit(void); +void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); @@ -3290,7 +3290,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int __init btrfs_init_sysfs(void); -void btrfs_exit_sysfs(void); +void 
__cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); @@ -3302,13 +3302,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); -static inline __printf(2, 3) +static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } #ifdef CONFIG_PRINTK __printf(2, 3) +__cold void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #else #define btrfs_printk(fs_info, fmt, args...) \ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 09939fc37f2a..d06bef16ebd5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -42,7 +42,7 @@ int __init btrfs_delayed_inode_init(void) return 0; } -void btrfs_delayed_inode_exit(void) +void __cold btrfs_delayed_inode_exit(void) { kmem_cache_destroy(delayed_node_cache); } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index ae893d85224f..100a91e26b55 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -149,7 +149,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, /* for init */ int __init btrfs_delayed_inode_init(void); -void btrfs_delayed_inode_exit(void); +void __cold btrfs_delayed_inode_exit(void); /* for debugging */ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 7ab5e0128f0c..03bdf355107a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -930,7 +930,7 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt return find_ref_head(&delayed_refs->href_root, bytenr, 0); } -void btrfs_delayed_ref_exit(void) +void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c4f625e5a691..9e3e5aff0937 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -204,7 +204,7 @@ extern struct kmem_cache *btrfs_delayed_data_ref_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); -void btrfs_delayed_ref_exit(void); +void __cold btrfs_delayed_ref_exit(void); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c10c84640eee..798e602c1834 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -110,7 +110,7 @@ int __init btrfs_end_io_wq_init(void) return 0; } -void btrfs_end_io_wq_exit(void) +void __cold btrfs_end_io_wq_exit(void) { kmem_cache_destroy(btrfs_end_io_wq_cache); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e2ac6a14150a..aaf99529883d 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -153,7 +153,7 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode, int create); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); -void btrfs_end_io_wq_exit(void); +void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4e73705b405e..da46e9372262 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -187,7 +187,7 @@ free_state_cache: return -ENOMEM; } -void extent_io_exit(void) +void __cold 
extent_io_exit(void) { btrfs_leak_debug_check(); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index da9be2fb0502..e359c5d4305c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -286,7 +286,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); -void extent_io_exit(void); +void __cold extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b8ead8dc2ebe..53a0633c6ef7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -19,7 +19,7 @@ int __init extent_map_init(void) return 0; } -void extent_map_exit(void) +void __cold extent_map_exit(void) { kmem_cache_destroy(extent_map_cache); } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b29f77bc0732..f6f8ba114977 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -86,7 +86,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); -void extent_map_exit(void); +void __cold extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 41ab9073d1d4..a335e2e6c84d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3378,7 +3378,7 @@ const struct file_operations btrfs_file_operations = { .dedupe_file_range = btrfs_dedupe_file_range, }; -void btrfs_auto_defrag_exit(void) +void __cold btrfs_auto_defrag_exit(void) { kmem_cache_destroy(btrfs_inode_defrag_cachep); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 491a7397f6fa..bb5de52cbc09 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9478,7 +9478,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -void btrfs_destroy_cachep(void) +void __cold btrfs_destroy_cachep(void) { /* * Make sure all delayed rcu free inodes are flushed before we diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5b311aeddcc8..9be98e42cfb6 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1154,7 +1154,7 @@ int __init ordered_data_init(void) return 0; } -void ordered_data_exit(void) +void __cold ordered_data_exit(void) { kmem_cache_destroy(btrfs_ordered_extent_cache); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c53e2cfb72d9..4a1672a13ba6 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -217,5 +217,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); -void ordered_data_exit(void); +void __cold ordered_data_exit(void); #endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b0c5d710183e..085542832b9a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -270,6 +270,7 @@ struct name_cache_entry { char name[]; }; +__cold static void inconsistent_snapshot_error(struct send_ctx *sctx, enum btrfs_compare_tree_result result, const char *what) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 07bc2bfbdb96..1dd2e785918c 100644 --- a/fs/btrfs/super.c 
+++ b/fs/btrfs/super.c @@ -2333,7 +2333,7 @@ static int __init btrfs_interface_init(void) return misc_register(&btrfs_misc); } -static void btrfs_interface_exit(void) +static __cold void btrfs_interface_exit(void) { misc_deregister(&btrfs_misc); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a8bafed931f4..6af7b58e1a90 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -923,7 +923,7 @@ out1: return ret; } -void btrfs_exit_sysfs(void) +void __cold btrfs_exit_sysfs(void) { sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e96cfd93ae3f..8871286c1a91 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -52,6 +52,7 @@ * Allows callers to customize the output. */ __printf(4, 5) +__cold static void generic_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) @@ -76,6 +77,7 @@ static void generic_err(const struct btrfs_fs_info *fs_info, * offset has its own meaning. */ __printf(4, 5) +__cold static void file_extent_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) @@ -229,6 +231,7 @@ static int check_csum_item(struct btrfs_fs_info *fs_info, * which represents inode number */ __printf(4, 5) +__cold static void dir_item_err(const struct btrfs_fs_info *fs_info, const struct extent_buffer *eb, int slot, const char *fmt, ...) From 16db5758fef8c71b0353a7b5505496d6f9229820 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 20 Feb 2018 22:50:36 +0800 Subject: [PATCH 063/164] btrfs: remove assert in btrfs_init_dev_replace_tgtdev() In the same function we just ran btrfs_alloc_device() which means the btrfs_device::resized_list is sure to be empty and we are protected with the btrfs_fs_info::volume_mutex. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 797e7706e67b..b580ef08c368 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2658,7 +2658,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->total_bytes = btrfs_device_get_total_bytes(srcdev); device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); device->bytes_used = btrfs_device_get_bytes_used(srcdev); - ASSERT(list_empty(&srcdev->resized_list)); device->commit_total_bytes = srcdev->commit_total_bytes; device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; From de224b7c56baa335bde9afcb4aa68f03c38f5f42 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 14 Feb 2018 10:53:36 +0200 Subject: [PATCH 064/164] btrfs: Remove redundant memory barriers around dio_private error status Using any kind of memory barriers around atomic operations which have a return value is redundant, since those operations themselves are fully ordered. atomic_t.txt states: - RMW operations that have a return value are fully ordered; Fully ordered primitives are ordered against everything prior and everything subsequent. Therefore a fully ordered primitive is like having an smp_mb() before and an smp_mb() after the primitive. Given this let's replace the extra memory barriers with comments. 
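To illustrate the ordering argument, a simplified sketch with hypothetical names (not the btrfs code itself): the plain store of the error flag needs no smp_mb__before_atomic() because the value-returning atomic that follows is already fully ordered.

  #include <linux/atomic.h>

  struct demo_ctx {
          atomic_t pending;   /* outstanding sub-I/Os */
          int errors;         /* set on any failure */
  };

  static void demo_complete_one(struct demo_ctx *ctx, int err, void (*finish)(int))
  {
          if (err)
                  ctx->errors = 1;        /* plain store, no barrier needed */

          /*
           * atomic_dec_and_test() is a value-returning RMW and is therefore
           * fully ordered per Documentation/atomic_t.txt, so the store to
           * ->errors above cannot be reordered past it, which is exactly
           * what the removed smp_mb__before_atomic() used to enforce.
           */
          if (atomic_dec_and_test(&ctx->pending))
                  finish(ctx->errors);
  }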
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bb5de52cbc09..be167f5ec433 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8322,13 +8322,13 @@ static void btrfs_end_dio_bio(struct bio *bio) err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); if (err) { - dip->errors = 1; - /* - * before atomic variable goto zero, we must make sure - * dip->errors is perceived to be set. + * We want to perceive the errors flag being set before + * decrementing the reference count. We don't need a barrier + * since atomic operations with a return value are fully + * ordered as per atomic_t.txt */ - smp_mb__before_atomic(); + dip->errors = 1; } /* if there are more bios still pending for this dio, just exit */ @@ -8516,10 +8516,11 @@ submit: out_err: dip->errors = 1; /* - * before atomic variable goto zero, we must - * make sure dip->errors is perceived to be set. + * Before atomic variable goto zero, we must make sure dip->errors is + * perceived to be set. This ordering is ensured by the fact that an + * atomic operations with a return value are fully ordered as per + * atomic_t.txt */ - smp_mb__before_atomic(); if (atomic_dec_and_test(&dip->pending_bios)) bio_io_error(dip->orig_bio); From d612ac59efc3b57858f310c8471d7ee2779658c9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 26 Feb 2018 16:46:05 +0800 Subject: [PATCH 065/164] btrfs: unify types for metadata_ratio and data_chunk_allocations We have btrfs_fs_info::data_chunk_allocations and btrfs_fs_info::metadata_ratio declared as unsigned, which is unsigned int, and kernel style prefers unsigned int over bare unsigned. So this patch changes them to u32. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/super.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d6a2fc311187..eabc8431b442 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -994,8 +994,8 @@ struct btrfs_fs_info { struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; - unsigned data_chunk_allocations; - unsigned metadata_ratio; + u32 data_chunk_allocations; + u32 metadata_ratio; void *bdev_holder; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1dd2e785918c..defaccde8d16 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1737,7 +1737,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_compress_type = fs_info->compress_type; u64 old_max_inline = fs_info->max_inline; u32 old_thread_pool_size = fs_info->thread_pool_size; - unsigned int old_metadata_ratio = fs_info->metadata_ratio; + u32 old_metadata_ratio = fs_info->metadata_ratio; int ret; sync_filesystem(sb); From ed5d5f37e653b606c93b2d5f1cdd155be6fefce0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 27 Feb 2018 18:10:58 -0700 Subject: [PATCH 066/164] Btrfs: dev-replace: skip prealloc extents when copy nocow pages It doesn't make sense to process prealloc extents as pages will be filled with zero when reading prealloc extents.
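Expressed as a standalone predicate for illustration (hypothetical helper, assuming the btrfs extent_map definitions; the actual change is a single extra test in check_extent_to_block()):

  /* Illustration only: a range backed by a preallocated extent reads
   * back as zeros, so there is nothing worth copying for it. */
  static bool nocow_range_worth_copying(const struct extent_map *em)
  {
          return !test_bit(EXTENT_FLAG_PREALLOC, &em->flags);
  }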
Signed-off-by: Liu Bo Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index beb441d0c5c6..9fb7d09842e6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4480,7 +4480,8 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, * move on to the next inode. */ if (em->block_start > logical || - em->block_start + em->block_len < logical + len) { + em->block_start + em->block_len < logical + len || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { free_extent_map(em); ret = 1; goto out_unlock; From 213e8c5520ed1ecc5401a3a0f716b51a7318bda9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 6 Feb 2018 20:40:31 +0000 Subject: [PATCH 067/164] Btrfs: skip writeback of last page when truncating file to same size When we truncate a file to the same size and that size is not aligned with the sector size, we end up triggering writeback (and wait for it to complete) of the last page. This is unncessary as we can not have delayed allocation beyond the inode's i_size and the goal of truncating a file to its own size is to discard prealloc extents (allocated via the fallocate(2) system call). Besides the unnecessary IO start and wait, it also breaks the oppurtunity for larger contiguous extents on disk, as before the last dirty page there might be other dirty pages. This scenario is probably not very common in general, however it is common for btrfs receive implementations because currently the send stream always issues a truncate operation for each processed inode as the last operation for that inode (this truncate operation is not always needed and the send implementation will be addressed to avoid them). So improve this by not starting and waiting for writeback of the inode's last page when we are truncating to exactly the same size. The following script was used to quickly measure the time a receive operation takes: $ cat test_send.sh #!/bin/bash SRC_DEV=/dev/sdc DST_DEV=/dev/sdd SRC_MNT=/mnt/sdc DST_MNT=/mnt/sdd mkfs.btrfs -f $SRC_DEV >/dev/null mkfs.btrfs -f $DST_DEV >/dev/null mount $SRC_DEV $SRC_MNT mount $DST_DEV $DST_MNT echo "Creating source filesystem" for ((t = 0; t < 10; t++)); do ( for ((i = 1; i <= 20000; i++)); do xfs_io -f -c "pwrite -S 0xab 0 5000" \ $SRC_MNT/file_$i > /dev/null done ) & worker_pids[$t]=$! 
done wait ${worker_pids[@]} echo "Creating and sending snapshot" btrfs subvolume snapshot -r $SRC_MNT $SRC_MNT/snap1 >/dev/null /usr/bin/time -f "send took %e seconds" \ btrfs send -f $SRC_MNT/send_file $SRC_MNT/snap1 /usr/bin/time -f "receive took %e seconds" \ btrfs receive -f $SRC_MNT/send_file $DST_MNT umount $SRC_MNT umount $DST_MNT The results for 5 runs were the following: * Without this change average receive time was 26.49 seconds standard deviation of 2.53 seconds * With this change average receive time was 12.51 seconds standard deviation of 0.32 seconds Reported-by: Robbie Ko Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index be167f5ec433..6dbdde9a798e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -101,7 +101,7 @@ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { }; static int btrfs_setsize(struct inode *inode, struct iattr *attr); -static int btrfs_truncate(struct inode *inode); +static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, @@ -3668,7 +3668,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) goto out; } - ret = btrfs_truncate(inode); + ret = btrfs_truncate(inode, false); if (ret) btrfs_orphan_del(NULL, BTRFS_I(inode)); } else { @@ -5154,7 +5154,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_dio_wait(inode); btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); - ret = btrfs_truncate(inode); + ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { int err; @@ -9136,7 +9136,7 @@ out_noreserve: return ret; } -static int btrfs_truncate(struct inode *inode) +static int btrfs_truncate(struct inode *inode, bool skip_writeback) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -9147,10 +9147,12 @@ static int btrfs_truncate(struct inode *inode) u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); - ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), - (u64)-1); - if (ret) - return ret; + if (!skip_writeback) { + ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + (u64)-1); + if (ret) + return ret; + } /* * Yes ladies and gentlemen, this is indeed ugly. The fact is we have From ffa7c4296e93b25fc302c209ae488e57aad4c973 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 6 Feb 2018 20:40:40 +0000 Subject: [PATCH 068/164] Btrfs: send, do not issue unnecessary truncate operations When send finishes processing an inode representing a regular file, it always issues a truncate operation for that file, even if its size did not change or the last write sets the file size correctly. In the most common cases, the issued write operations set the file to correct size (either full or incremental sends) or the file size did not change (for incremental sends), so the only case where a truncate operation is needed is when a file size becomes smaller in the send snapshot when compared to the parent snapshot. By not issuing unnecessary truncate operations we reduce the stream size and save time in the receiver. 
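Roughly, the rule that ends up being applied can be sketched as follows (parameter names are made up; the real decision lives in finish_inode_if_needed() and works on send_ctx state):

#include <linux/types.h>

static bool demo_need_truncate(u64 parent_size, u64 send_size,
                               u64 next_write_offset, bool new_inode)
{
        /* Newly created file: the issued writes set the final size. */
        if (new_inode)
                return next_write_offset != send_size;
        /* Size unchanged compared to the parent snapshot. */
        if (parent_size == send_size)
                return false;
        /* File grew and the last write ends exactly at the new size. */
        if (send_size > parent_size && next_write_offset == send_size)
                return false;
        /* File shrank, or the writes did not set the size: truncate. */
        return true;
}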
Currently truncating a file to the same size triggers writeback of its last page (if it's dirty) and waits for it to complete (only if the file size is not aligned with the filesystem's sector size). This is being fixed by another patch and is independent of this change (that patch's title is "Btrfs: skip writeback of last page when truncating file to same size"). The following script was used to measure time spent by a receiver without this change applied, with this change applied, and without this change and with the truncate fix applied (the fix to not make it start and wait for writeback to complete). $ cat test_send.sh #!/bin/bash SRC_DEV=/dev/sdc DST_DEV=/dev/sdd SRC_MNT=/mnt/sdc DST_MNT=/mnt/sdd mkfs.btrfs -f $SRC_DEV >/dev/null mkfs.btrfs -f $DST_DEV >/dev/null mount $SRC_DEV $SRC_MNT mount $DST_DEV $DST_MNT echo "Creating source filesystem" for ((t = 0; t < 10; t++)); do ( for ((i = 1; i <= 20000; i++)); do xfs_io -f -c "pwrite -S 0xab 0 5000" \ $SRC_MNT/file_$i > /dev/null done ) & worker_pids[$t]=$! done wait ${worker_pids[@]} echo "Creating and sending snapshot" btrfs subvolume snapshot -r $SRC_MNT $SRC_MNT/snap1 >/dev/null /usr/bin/time -f "send took %e seconds" \ btrfs send -f $SRC_MNT/send_file $SRC_MNT/snap1 /usr/bin/time -f "receive took %e seconds" \ btrfs receive -f $SRC_MNT/send_file $DST_MNT umount $SRC_MNT umount $DST_MNT The results, which are averages for 5 runs for each case, were the following: * Without this change average receive time was 26.49 seconds standard deviation of 2.53 seconds * Without this change and with the truncate fix average receive time was 12.51 seconds standard deviation of 0.32 seconds * With this change and without the truncate fix average receive time was 10.02 seconds standard deviation of 1.11 seconds Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 085542832b9a..6615d849f0f0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -112,6 +112,7 @@ struct send_ctx { u64 cur_inode_mode; u64 cur_inode_rdev; u64 cur_inode_last_extent; + u64 cur_inode_next_write_offset; u64 send_progress; @@ -5030,6 +5031,7 @@ static int send_hole(struct send_ctx *sctx, u64 end) break; offset += len; } + sctx->cur_inode_next_write_offset = offset; tlv_put_failure: fs_path_free(p); return ret; @@ -5265,6 +5267,7 @@ static int send_write_or_clone(struct send_ctx *sctx, } else { ret = send_extent_data(sctx, offset, len); } + sctx->cur_inode_next_write_offset = offset + len; out: return ret; } @@ -5789,6 +5792,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 right_gid; int need_chmod = 0; int need_chown = 0; + int need_truncate = 1; int pending_move = 0; int refs_processed = 0; @@ -5826,9 +5830,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode)) need_chmod = 1; + if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size) + need_truncate = 0; } else { + u64 old_size; + ret = get_inode_info(sctx->parent_root, sctx->cur_ino, - NULL, NULL, &right_mode, &right_uid, + &old_size, NULL, &right_mode, &right_uid, &right_gid, NULL); if (ret < 0) goto out; @@ -5837,6 +5845,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; + if ((old_size == sctx->cur_inode_size) || + (sctx->cur_inode_size > 
old_size && + sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) + need_truncate = 0; } if (S_ISREG(sctx->cur_inode_mode)) { @@ -5855,10 +5867,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; } } - ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, - sctx->cur_inode_size); - if (ret < 0) - goto out; + if (need_truncate) { + ret = send_truncate(sctx, sctx->cur_ino, + sctx->cur_inode_gen, + sctx->cur_inode_size); + if (ret < 0) + goto out; + } } if (need_chown) { @@ -5912,6 +5927,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; sctx->cur_inode_last_extent = (u64)-1; + sctx->cur_inode_next_write_offset = 0; /* * Set send_progress to current inode. This will tell all get_cur_xxx From 7852781d94b30096ca0f273aa776d2dbcca6d640 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Feb 2018 15:48:52 +0100 Subject: [PATCH 069/164] btrfs: drop underscores from exported xattr functions Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/acl.c | 6 +++--- fs/btrfs/props.c | 6 +++--- fs/btrfs/xattr.c | 12 ++++++------ fs/btrfs/xattr.h | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1ba49ebe67da..f8a1bdf06b2a 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -46,12 +46,12 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) BUG(); } - size = __btrfs_getxattr(inode, name, "", 0); + size = btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); - size = __btrfs_getxattr(inode, name, value, size); + size = btrfs_getxattr(inode, name, value, size); } if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); @@ -101,7 +101,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, goto out; } - ret = __btrfs_setxattr(trans, inode, name, value, size, 0); + ret = btrfs_setxattr(trans, inode, name, value, size, 0); out: kfree(value); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index e4ac24175524..5859f7d3cf3e 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -116,7 +116,7 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, return -EINVAL; if (value_len == 0) { - ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + ret = btrfs_setxattr(trans, inode, handler->xattr_name, NULL, 0, flags); if (ret) return ret; @@ -130,13 +130,13 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans, ret = handler->validate(value, value_len); if (ret) return ret; - ret = __btrfs_setxattr(trans, inode, handler->xattr_name, + ret = btrfs_setxattr(trans, inode, handler->xattr_name, value, value_len, flags); if (ret) return ret; ret = handler->apply(inode, value, value_len); if (ret) { - __btrfs_setxattr(trans, inode, handler->xattr_name, + btrfs_setxattr(trans, inode, handler->xattr_name, NULL, 0, flags); return ret; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index de7d072c78ef..414ccbe26c71 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -33,7 +33,7 @@ #include "locking.h" -ssize_t __btrfs_getxattr(struct inode *inode, const char *name, +ssize_t btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) { struct btrfs_dir_item *di; @@ -233,7 +233,7 @@ out: /* * @value: "" makes the attribute to empty, NULL removes it */ -int __btrfs_setxattr(struct btrfs_trans_handle *trans, +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const 
char *name, const void *value, size_t size, int flags) { @@ -374,7 +374,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, const char *name, void *buffer, size_t size) { name = xattr_full_name(handler, name); - return __btrfs_getxattr(inode, name, buffer, size); + return btrfs_getxattr(inode, name, buffer, size); } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, @@ -383,7 +383,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { name = xattr_full_name(handler, name); - return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); + return btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, @@ -448,8 +448,8 @@ static int btrfs_initxattrs(struct inode *inode, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = __btrfs_setxattr(trans, inode, name, - xattr->value, xattr->value_len, 0); + err = btrfs_setxattr(trans, inode, name, xattr->value, + xattr->value_len, 0); kfree(name); if (err < 0) break; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 15fc4743dc70..3c227c549dfc 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -23,9 +23,9 @@ extern const struct xattr_handler *btrfs_xattr_handlers[]; -extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, +extern ssize_t btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, +extern int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); From ab0d09361662b7593fe166d27ad49b8852a2ef3e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Feb 2018 15:48:54 +0100 Subject: [PATCH 070/164] btrfs: drop extern from function declarations Extern for functions does not make any difference, there are only a few so let's remove them before it's too late. 
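For example, the two declarations below are equivalent, since function declarations have external linkage by default (a sketch assuming the usual kernel headers are in scope; the real prototypes live in xattr.h):

/* "extern" adds nothing here, both lines declare the same function. */
extern ssize_t btrfs_getxattr(struct inode *inode, const char *name,
                              void *buffer, size_t size);
ssize_t btrfs_getxattr(struct inode *inode, const char *name,
                       void *buffer, size_t size);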
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/xattr.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e359c5d4305c..c82a5842d524 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -83,8 +83,8 @@ static inline int le_test_bit(int nr, const u8 *addr) return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); } -extern void le_bitmap_set(u8 *map, unsigned int start, int len); -extern void le_bitmap_clear(u8 *map, unsigned int start, int len); +void le_bitmap_set(u8 *map, unsigned int start, int len); +void le_bitmap_clear(u8 *map, unsigned int start, int len); struct extent_state; struct btrfs_root; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 3c227c549dfc..96e6f3a304d3 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -23,13 +23,13 @@ extern const struct xattr_handler *btrfs_xattr_handlers[]; -extern ssize_t btrfs_getxattr(struct inode *inode, const char *name, +ssize_t btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -extern int btrfs_setxattr(struct btrfs_trans_handle *trans, +int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr); From bcadd7050a43216e6ace936c23b4055fb5966ca1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Feb 2018 15:48:57 +0100 Subject: [PATCH 071/164] btrfs: adjust return type of btrfs_getxattr The xattr_handler::get prototype returns int, use it. The only ssize_t exception is the per-inode listxattr handler. Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/xattr.c | 2 +- fs/btrfs/xattr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 414ccbe26c71..e1e8177deb5e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -33,7 +33,7 @@ #include "locking.h" -ssize_t btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) { struct btrfs_dir_item *di; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 96e6f3a304d3..57c638730617 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -23,7 +23,7 @@ extern const struct xattr_handler *btrfs_xattr_handlers[]; -ssize_t btrfs_getxattr(struct inode *inode, const char *name, +int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, From 738c93d42c277d790cd49372c9bf24bcfea13306 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Feb 2018 15:48:59 +0100 Subject: [PATCH 072/164] btrfs: move btrfs_listxattr prototype to xattr.h There's a proper header for xattr handlers. 
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/xattr.h | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eabc8431b442..f4926dc9649f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3294,9 +3294,6 @@ void __cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); -/* xattr.c */ -ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 57c638730617..e215a3212a2a 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,6 +28,7 @@ int btrfs_getxattr(struct inode *inode, const char *name, int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, From 448f3a17ac538f6e5b0fa94259cb41f3f019394f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 27 Feb 2018 17:37:16 +0200 Subject: [PATCH 073/164] btrfs: Remove redundant comment from btrfs_search_forward This function always sets keep_locks to 1 and saves the old value of keep_locks which is restored at the end. So there is no way it can be called without keep_locks being set. Remove comment imposing redundant requirement on callers. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b88a79e69ddf..a80fcd285b34 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5145,9 +5145,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) * into min_key, so you can call btrfs_search_slot with cow=1 on the * key and get a writable path. * - * This does lock as it descends, and path->keep_locks should be set - * to 1 by the caller. - * * This honors path->lowest_level to prevent descent past a given level * of the tree. * From f882274b2df2a2533b4fc81713838778d4ac12d6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 27 Feb 2018 17:37:17 +0200 Subject: [PATCH 074/164] btrfs: Remove root arg from btrfs_log_inode_parent btrfs_log_inode_parent is called from 2 places (btrfs_log_dentry_safe and btrfs_log_new_name) both of which pass inode->root as the root argument and the inode itself. Remove the redundant root argument and get a reference to the root directly from the inode, also remove redundant root != inode->root check from the same function. No functional change. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bbd8a40b2006..05ce8b12beec 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5513,7 +5513,6 @@ out: * the last committed transaction */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct dentry *parent, const loff_t start, @@ -5521,6 +5520,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int inode_only, struct btrfs_log_ctx *ctx) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct super_block *sb; struct dentry *old_parent = NULL; @@ -5546,7 +5546,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) { + if (btrfs_root_refs(&root->root_item) == 0) { ret = 1; goto end_no_trans; } @@ -5686,8 +5686,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), - parent, start, end, LOG_INODE_ALL, ctx); + ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, + start, end, LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -5949,7 +5949,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct dentry *parent) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; /* * this will force the logging code to walk the dentry chain @@ -5966,7 +5965,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 0, - LLONG_MAX, LOG_INODE_EXISTS, NULL); + return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, NULL); } From e5b84f7a258a8344947e8b955b8c2648008d8ccc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 27 Feb 2018 17:37:18 +0200 Subject: [PATCH 075/164] btrfs: Remove root argument from btrfs_log_dentry_safe Now that nothing uses the root arg of btrfs_log_dentry_safe it can be safely removed. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 2 +- fs/btrfs/tree-log.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a335e2e6c84d..8f425c64d75f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2214,7 +2214,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 05ce8b12beec..7b8fee45b29e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5678,7 +5678,7 @@ end_no_trans: * data on disk. 
*/ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry, + struct dentry *dentry, const loff_t start, const loff_t end, struct btrfs_log_ctx *ctx) diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 483027f9a7f4..88abc43312a1 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -65,7 +65,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry, + struct dentry *dentry, const loff_t start, const loff_t end, struct btrfs_log_ctx *ctx); From 895a72be411e9a6b40c1062f0f54faca2bb03568 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 2 Mar 2018 18:05:49 -0700 Subject: [PATCH 076/164] Btrfs: send: fix typo in TLV_PUT According to tlv_put()'s prototype, data and attrlen needs to be exchanged in the macro, but seems all callers are already aware of this misorder and are therefore not affected. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6615d849f0f0..1f5748c7d1c7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -613,9 +613,9 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, } -#define TLV_PUT(sctx, attrtype, attrlen, data) \ +#define TLV_PUT(sctx, attrtype, data, attrlen) \ do { \ - ret = tlv_put(sctx, attrtype, attrlen, data); \ + ret = tlv_put(sctx, attrtype, data, attrlen); \ if (ret < 0) \ goto tlv_put_failure; \ } while (0) From d02c0e20194bfd3066bff2505b2540a14f3fa3d2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 2 Mar 2018 09:43:15 +0200 Subject: [PATCH 077/164] btrfs: Remove root argument from cow_file_range_inline This argument is always set to the root of the inode, which is also passed. So let's get a reference inside the function and simplify the arg list. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dbdde9a798e..ce65acafe7bb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -276,12 +276,12 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_root *root, - struct inode *inode, u64 start, +static noinline int cow_file_range_inline(struct inode *inode, u64 start, u64 end, size_t compressed_size, int compress_type, struct page **compressed_pages) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 isize = i_size_read(inode); @@ -457,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode, int *num_added) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; u64 blocksize = fs_info->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -579,11 +578,11 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. 
*/ - ret = cow_file_range_inline(root, inode, start, end, - 0, BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, start, end, 0, + BTRFS_COMPRESS_NONE, NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(root, inode, start, end, + ret = cow_file_range_inline(inode, start, end, total_compressed, compress_type, pages); } @@ -983,8 +982,8 @@ static noinline int cow_file_range(struct inode *inode, if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(root, inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(inode, start, end, 0, + BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the From 9b99b11564446600f3a3ce394a841d525b7993e7 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 27 Feb 2018 12:41:59 +0800 Subject: [PATCH 078/164] btrfs: rename btrfs_close_extra_device to btrfs_free_extra_devids This function btrfs_close_extra_devices() is about freeing extra devids which once it may have belonged to this filesystem. So rename it and add the comment. The _devid suffix is appropriate as this function won't handle devices which are outside of the filesytem being mounted. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/volumes.c | 6 +++++- fs/btrfs/volumes.h | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 798e602c1834..bbbcbf87ac93 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2758,10 +2758,10 @@ int open_ctree(struct super_block *sb, } /* - * keep the device that is marked to be the target device for the - * dev_replace procedure + * Keep the devid that is marked to be the target device for the + * device replace procedure */ - btrfs_close_extra_devices(fs_devices, 0); + btrfs_free_extra_devids(fs_devices, 0); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); @@ -2824,7 +2824,7 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_devices, 1); + btrfs_free_extra_devids(fs_devices, 1); ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b580ef08c368..1622d3b73771 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -896,7 +896,11 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) +/* + * After we have read the system tree and know devids belonging to + * this filesystem, remove the device which does not belong there. 
+ */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; struct btrfs_device *latest_dev = NULL; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d28f5745fee2..d1fcaea9fef5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -422,7 +422,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, From 005d67127fa9dfb3382f2c9e918feed7a243a7fe Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 5 Mar 2018 17:13:37 +0800 Subject: [PATCH 079/164] btrfs: adjust return values of btrfs_inode_by_name Previously, btrfs_inode_by_name() returned 0 which left caller to check objectid of location even location if the type was invalid. Let btrfs_inode_by_name() return -EUCLEAN if a corrupted location of a dir entry is found. Removal of label out_err also simplifies the function. Signed-off-by: Su Yue Reviewed-by: David Sterba [ drop unlikely ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ce65acafe7bb..4a07cce28215 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5489,7 +5489,8 @@ no_delete: /* * this returns the key found in the dir entry in the location pointer. - * If no dir entries were found, location->objectid is 0. + * If no dir entries were found, returns -ENOENT. + * If found a corrupted location in dir entry, returns -EUCLEAN. */ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location) @@ -5507,27 +5508,27 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), name, namelen, 0); - if (IS_ERR(di)) + if (!di) { + ret = -ENOENT; + goto out; + } + if (IS_ERR(di)) { ret = PTR_ERR(di); - - if (IS_ERR_OR_NULL(di)) - goto out_err; + goto out; + } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); if (location->type != BTRFS_INODE_ITEM_KEY && location->type != BTRFS_ROOT_ITEM_KEY) { + ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", __func__, name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); - goto out_err; } out: btrfs_free_path(path); return ret; -out_err: - location->objectid = 0; - goto out; } /* @@ -5830,9 +5831,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret < 0) return ERR_PTR(ret); - if (location.objectid == 0) - return ERR_PTR(-ENOENT); - if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, &location, root, NULL); return inode; From d6a691350b5d5381fad04a6dbf31bb6f393089b5 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 2 Mar 2018 16:10:39 -0700 Subject: [PATCH 080/164] Btrfs: raid56: remove redundant async_missing_raid56 async_missing_raid56() is identical to async_read_rebuild(). 
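The deleted helper only queued __raid56_parity_recover() through a work item; the retained async_read_rebuild() presumably has the same shape (a sketch inferred from the removed code below, not a quote of the actual function):

/* Assumed shape of async_read_rebuild(), mirroring the deleted helper. */
static void read_rebuild_work(struct btrfs_work *work)
{
        struct btrfs_raid_bio *rbio;

        rbio = container_of(work, struct btrfs_raid_bio, work);
        __raid56_parity_recover(rbio);
}

static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
        btrfs_init_work(&rbio->work, btrfs_rmw_helper, read_rebuild_work,
                        NULL, NULL);
        btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}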
Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcfc20de2df3..002bddc7fa09 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2768,24 +2768,8 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, return rbio; } -static void missing_raid56_work(struct btrfs_work *work) -{ - struct btrfs_raid_bio *rbio; - - rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); -} - -static void async_missing_raid56(struct btrfs_raid_bio *rbio) -{ - btrfs_init_work(&rbio->work, btrfs_rmw_helper, - missing_raid56_work, NULL, NULL); - - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); -} - void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - async_missing_raid56(rbio); + async_read_rebuild(rbio); } From 4759700a71ab109c67376360e9f00e2fed6b3807 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 2 Mar 2018 16:10:41 -0700 Subject: [PATCH 081/164] Btrfs: dev-replace: make sure target is identical to source when raid56 rebuild fails In the last step of scrub_handle_error_block, we try to combine good copies on all possible mirrors, this works fine for raid1 and raid10, but not for raid56 as it's doing parity rebuild. If parity rebuild doesn't get back with correct data which matches its checksum, in case of replace we'd rather write what is stored in the source device than the data calculuated from parity. Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9fb7d09842e6..f4c9b30903b0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1412,8 +1412,17 @@ nodatasum_case: if (!page_bad->io_error && !sctx->is_dev_replace) continue; - /* try to find no-io-error page in mirrors */ - if (page_bad->io_error) { + if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + /* + * In case of dev replace, if raid56 rebuild process + * didn't work out correct data, then copy the content + * in sblock_bad to make sure target device is identical + * to source device, instead of writing garbage data in + * sblock_for_recheck array to target device. + */ + sblock_other = NULL; + } else if (page_bad->io_error) { + /* try to find no-io-error page in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && sblocks_for_recheck[mirror_index].page_count > 0; From 965aab1cfc13dacc3d5ad0d8d8eb4e0a7ea8c1d1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 6 Mar 2018 11:23:16 -0800 Subject: [PATCH 082/164] btrfs: Use filemap_range_has_page() The current implementation of btrfs_page_exists_in_range() gives the wrong answer if the workingset code has stored a shadow entry in the page cache. The filemap_range_has_page() function does not have this problem, and it's shared code, so use it instead. 
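For reference, the shared helper takes byte offsets on the inode's mapping and, unlike the removed open-coded radix tree walk, does not count shadow (workingset) entries as present pages. A minimal usage sketch with a hypothetical wrapper name:

#include <linux/fs.h>

static bool demo_range_has_page(struct inode *inode, loff_t start, loff_t end)
{
        return filemap_range_has_page(inode->i_mapping, start, end);
}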
Signed-off-by: Matthew Wilcox Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 6 +++- fs/btrfs/inode.c | 70 ------------------------------------------ 2 files changed, 5 insertions(+), 71 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index f527e99c9f8d..078a53e01ece 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -364,6 +364,10 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, logical_start, csum, csum_expected, mirror_num); } -bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end); +static inline bool btrfs_page_exists_in_range(struct inode *inode, + loff_t start, loff_t end) +{ + return filemap_range_has_page(inode->i_mapping, start, end); +} #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a07cce28215..c92455428c01 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7464,76 +7464,6 @@ out: return ret; } -bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end) -{ - struct radix_tree_root *root = &inode->i_mapping->page_tree; - bool found = false; - void **pagep = NULL; - struct page *page = NULL; - unsigned long start_idx; - unsigned long end_idx; - - start_idx = start >> PAGE_SHIFT; - - /* - * end is the last byte in the last page. end == start is legal - */ - end_idx = end >> PAGE_SHIFT; - - rcu_read_lock(); - - /* Most of the code in this while loop is lifted from - * find_get_page. It's been modified to begin searching from a - * page and return just the first page found in that range. If the - * found idx is less than or equal to the end idx then we know that - * a page exists. If no pages are found or if those pages are - * outside of the range then we're fine (yay!) */ - while (page == NULL && - radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - page = NULL; - continue; - } - /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. - */ - page = NULL; - break; /* TODO: Is this relevant for this use case? */ - } - - if (!page_cache_get_speculative(page)) { - page = NULL; - continue; - } - - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(page); - page = NULL; - } - } - - if (page) { - if (page->index <= end_idx) - found = true; - put_page(page); - } - - rcu_read_unlock(); - return found; -} - static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, int writing) { From 051c98eb11e6fd64a8306851c34ee485b5817955 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 7 Mar 2018 15:33:22 +0100 Subject: [PATCH 083/164] btrfs: open code trivial helper btrfs_page_exists_in_range The called function name is self explanatory.
Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 6 ------ fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 078a53e01ece..ca15be569d69 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -364,10 +364,4 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, logical_start, csum, csum_expected, mirror_num); } -static inline bool btrfs_page_exists_in_range(struct inode *inode, - loff_t start, loff_t end) -{ - return filemap_range_has_page(inode->i_mapping, start, end); -} - #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8f425c64d75f..8cac40005e6c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2482,7 +2482,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, if ((!ordered || (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + !filemap_range_has_page(inode->i_mapping, + lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c92455428c01..fc5b7d82b842 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7489,8 +7489,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * get stale data. */ if (!ordered && - (!writing || - !btrfs_page_exists_in_range(inode, lockstart, lockend))) + (!writing || !filemap_range_has_page(inode->i_mapping, + lockstart, lockend))) break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, From 7c829b722dffb22aaf9e3ea1b1d88dac49bd0768 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 7 Mar 2018 17:29:18 +0800 Subject: [PATCH 084/164] btrfs: add define for oldest generation Some functions can filter metadata by the generation. Add a define that will annotate such arguments. 
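For instance, a tree search that should not filter by transaction id can state that explicitly (a sketch of a call site with a hypothetical helper name, mirroring the changes below):

#include "ctree.h"

static int demo_search_any_generation(struct btrfs_root *root,
                                      struct btrfs_key *key,
                                      struct btrfs_path *path)
{
        /* A literal 0 works as well; the define documents the intent. */
        return btrfs_search_forward(root, key, path, BTRFS_OLDEST_GENERATION);
}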
Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/ioctl.c | 2 +- fs/btrfs/tree-defrag.c | 5 ++--- fs/btrfs/uuid-tree.c | 2 +- fs/btrfs/volumes.c | 3 ++- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f4926dc9649f..15bf111cd5c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -66,6 +66,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 +#define BTRFS_OLDEST_GENERATION 0ULL + #define BTRFS_COMPAT_EXTENT_TREE_V0 /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c5a559105949..ba403c00982c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2600,7 +2600,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) range->len = (u64)-1; } ret = btrfs_defrag_file(file_inode(file), file, - range, 0, 0); + range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; kfree(range); diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index cb65089127cc..c09dbe4bd6e7 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -39,7 +39,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int level; int next_key_ret = 0; u64 last_ret = 0; - u64 min_trans = 0; if (root->fs_info->extent_root == root) { /* @@ -81,7 +80,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->keep_locks = 1; - ret = btrfs_search_forward(root, &key, path, min_trans); + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret < 0) goto out; if (ret > 0) { @@ -130,7 +129,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, */ path->slots[1] = btrfs_header_nritems(path->nodes[1]); next_key_ret = btrfs_find_next_key(root, path, &key, 1, - min_trans); + BTRFS_OLDEST_GENERATION); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 726f928238d0..9916f03430bc 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -282,7 +282,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, key.offset = 0; again_search_slot: - ret = btrfs_search_forward(root, &key, path, 0); + ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret) { if (ret > 0) ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1622d3b73771..1e72357bdfa8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4208,7 +4208,8 @@ static int btrfs_uuid_scan_kthread(void *data) key.offset = 0; while (1) { - ret = btrfs_search_forward(root, &key, path, 0); + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); if (ret) { if (ret > 0) ret = 0; From 2e32ef87b074cb8098436634b649b4b2b523acbe Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 14 Feb 2018 14:37:26 +0200 Subject: [PATCH 085/164] btrfs: Relax memory barrier in btrfs_tree_unlock When performing an unlock on an extent buffer we'd like to order the decrement of extent_buffer::blocking_writers with waking up any waiters. In such situations it's sufficient to use smp_mb__after_atomic rather than the heavy smp_mb. On architectures where atomic operations are fully ordered (such as x86 or s390) unconditionally executing a heavyweight smp_mb instruction causes a severe hit to performance while bringin no improvements in terms of correctness. 
The better thing is to use the appropriate smp_mb__after_atomic routine which will do the correct thing (invoke a full smp_mb or in the case of ordered atomics insert a compiler barrier). Put another way, an RMW atomic op + smp_load__after_atomic equals, in terms of semantics, to a full smp_mb. This ensures that none of the problems described in the accompanying comment of waitqueue_active occur. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/locking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d13128c70ddd..621083f8932c 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -290,7 +290,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) /* * Make sure counter is updated before we wake up waiters. */ - smp_mb(); + smp_mb__after_atomic(); if (waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); } else { From 4d31778aa2fa342f5f92ca4025b293a1729161d1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 19 Dec 2017 15:44:54 +0800 Subject: [PATCH 086/164] btrfs: qgroup: Fix root item corruption when multiple same source snapshots are created with quota enabled When multiple pending snapshots referring to the same source subvolume are executed, enabled quota will cause root item corruption, where root items are using old bytenr (no backref in extent tree). This can be triggered by fstests btrfs/152. The cause is when source subvolume is still dirty, extra commit (simplied transaction commit) of qgroup_account_snapshot() can skip dirty roots not recorded in current transaction, making root item of source subvolume not updated. Fix it by forcing recording source subvolume in current transaction before qgroup sub-transaction commit. Reported-by: Justin Maggard Signed-off-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 665438542b96..17978fd731f6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -319,7 +319,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && root->last_trans < trans->transid) || force) { WARN_ON(root == fs_info->extent_root); - WARN_ON(root->commit_root != root->node); + WARN_ON(!force && root->commit_root != root->node); /* * see below for IN_TRANS_SETUP usage rules @@ -1371,6 +1371,14 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) return 0; + /* + * Ensure dirty @src will be commited. Or, after comming + * commit_fs_roots() and switch_commit_roots(), any dirty but not + * recorded root will never be updated again, causing an outdated root + * item. + */ + record_root_in_trans(trans, src, 1); + /* * We are going to commit transaction, see btrfs_commit_transaction() * comment for reason locking tree_log_mutex From 7a5a07a81062915c65ce27e80608b1c819b1f936 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 5 Feb 2018 10:41:13 +0200 Subject: [PATCH 087/164] btrfs: Remove userspace transaction ioctls Commit 3558d4f88ec8 ("btrfs: Deprecate userspace transaction ioctls") marked the beginning of the end of userspace transaction. This commit finishes the job! There are no known users and ceph does not use the ioctl anymore. 
Signed-off-by: Nikolay Borisov Acked-by: Sage Weil Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/file.c | 8 ---- fs/btrfs/ioctl.c | 95 ------------------------------------------------ 3 files changed, 104 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 15bf111cd5c5..f2396a13c0c2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3207,7 +3207,6 @@ void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8cac40005e6c..6d878f1d1082 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1997,8 +1997,6 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; - if (private && private->trans) - btrfs_ioctl_trans_end(filp); if (private && private->filldir_buf) kfree(private->filldir_buf); kfree(private); @@ -2189,12 +2187,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - /* - * ok we haven't committed the transaction yet, lets do a commit - */ - if (file->private_data) - btrfs_ioctl_trans_end(file); - /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ba403c00982c..5011b6272cfa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3936,73 +3936,6 @@ int btrfs_clone_file_range(struct file *src_file, loff_t off, return btrfs_clone_files(dst_file, src_file, off, len, destoff); } -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. - */ -static long btrfs_ioctl_trans_start(struct file *file) -{ - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - struct btrfs_file_private *private; - int ret; - static bool warned = false; - - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - - if (!warned) { - btrfs_warn(fs_info, - "Userspace transaction mechanism is considered " - "deprecated and slated to be removed in 4.17. 
" - "If you have a valid use case please " - "speak up on the mailing list"); - WARN_ON(1); - warned = true; - } - - ret = -EINPROGRESS; - private = file->private_data; - if (private && private->trans) - goto out; - if (!private) { - private = kzalloc(sizeof(struct btrfs_file_private), - GFP_KERNEL); - if (!private) - return -ENOMEM; - file->private_data = private; - } - - ret = -EROFS; - if (btrfs_root_readonly(root)) - goto out; - - ret = mnt_want_write_file(file); - if (ret) - goto out; - - atomic_inc(&fs_info->open_ioctl_trans); - - ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root); - if (IS_ERR(trans)) - goto out_drop; - - private->trans = trans; - return 0; - -out_drop: - atomic_dec(&fs_info->open_ioctl_trans); - mnt_drop_write_file(file); -out: - return ret; -} - static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); @@ -4244,30 +4177,6 @@ out: return ret; } -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. - */ -long btrfs_ioctl_trans_end(struct file *file) -{ - struct inode *inode = file_inode(file); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_private *private = file->private_data; - - if (!private || !private->trans) - return -EINVAL; - - btrfs_end_transaction(private->trans); - private->trans = NULL; - - atomic_dec(&root->fs_info->open_ioctl_trans); - - mnt_drop_write_file(file); - return 0; -} - static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, void __user *argp) { @@ -5575,10 +5484,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); - case BTRFS_IOC_TRANS_START: - return btrfs_ioctl_trans_start(file); - case BTRFS_IOC_TRANS_END: - return btrfs_ioctl_trans_end(file); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_TREE_SEARCH_V2: From 859e682d58a2bd60f1aa2a32f712ee444faf4003 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 5 Feb 2018 10:41:14 +0200 Subject: [PATCH 088/164] btrfs: Remove btrfs_file_private::trans Now that the userspace transaction IOCTL have been removed, this member is no longer used so just remove it Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f2396a13c0c2..395bfc51e256 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1268,7 +1268,6 @@ struct btrfs_root { }; struct btrfs_file_private { - struct btrfs_trans_handle *trans; void *filldir_buf; }; From bcf3a3e7fb55f9238bc848147316317d099c2048 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 5 Feb 2018 10:41:15 +0200 Subject: [PATCH 089/164] btrfs: Remove code referencing unused TRANS_USERSPACE Now that the userspace transaction ioctls have been removed, TRANS_USERSPACE is no longer used hence we can remove it. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 27 ++++++--------------------- fs/btrfs/transaction.h | 6 +----- 2 files changed, 7 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 17978fd731f6..34efc9773719 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -37,22 +37,16 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, - [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | - __TRANS_START), - [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE | - __TRANS_START | - __TRANS_ATTACH), - [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE | - __TRANS_START | + [TRANS_STATE_BLOCKED] = __TRANS_START, + [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), + [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN), - [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE | - __TRANS_START | + [TRANS_STATE_UNBLOCKED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), - [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE | - __TRANS_START | + [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK), @@ -449,9 +443,6 @@ static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return 0; - if (type == TRANS_USERSPACE) - return 1; - if (type == TRANS_START && !atomic_read(&fs_info->open_ioctl_trans)) return 1; @@ -593,7 +584,7 @@ again: got_it: btrfs_record_root_in_trans(h, root); - if (!current->journal_info && type != TRANS_USERSPACE) + if (!current->journal_info) current->journal_info = h; return h; @@ -670,12 +661,6 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root BTRFS_RESERVE_NO_FLUSH, true); } -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) -{ - return start_transaction(root, 0, TRANS_USERSPACE, - BTRFS_RESERVE_NO_FLUSH, true); -} - /* * btrfs_attach_transaction() - catch the running transaction * diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 2762c8d6191c..b6c94ce33503 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -105,21 +105,18 @@ struct btrfs_transaction { #define __TRANS_FREEZABLE (1U << 0) -#define __TRANS_USERSPACE (1U << 8) #define __TRANS_START (1U << 9) #define __TRANS_ATTACH (1U << 10) #define __TRANS_JOIN (1U << 11) #define __TRANS_JOIN_NOLOCK (1U << 12) #define __TRANS_DUMMY (1U << 13) -#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE) #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) #define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE) #define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK) -#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ - __TRANS_ATTACH) +#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) @@ -207,7 +204,6 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); From 92e2f7e37004115db2ba98c6999a74ff5e41c83f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 
5 Feb 2018 10:41:16 +0200 Subject: [PATCH 090/164] btrfs: Remove btrfs_fs_info::open_ioctl_trans Since userspace transactions have been removed we no longer have a use for this field, so delete it. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/extent-tree.c | 3 +-- fs/btrfs/transaction.c | 9 +++------ 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 395bfc51e256..50c068fd3b38 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -880,7 +880,6 @@ struct btrfs_fs_info { struct rb_root tree_mod_log; atomic_t async_delalloc_pages; - atomic_t open_ioctl_trans; /* * this is used to protect the following list -- ordered_roots. diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2760292e1175..1469117a8e37 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4338,8 +4338,7 @@ again: /* commit the current transaction and try again */ commit_trans: - if (need_commit && - !atomic_read(&fs_info->open_ioctl_trans)) { + if (need_commit) { need_commit--; if (need_commit > 0) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 34efc9773719..f4b1225a3bec 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -443,8 +443,7 @@ static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return 0; - if (type == TRANS_START && - !atomic_read(&fs_info->open_ioctl_trans)) + if (type == TRANS_START) return 1; return 0; @@ -766,8 +765,7 @@ out: void btrfs_throttle(struct btrfs_fs_info *fs_info) { - if (!atomic_read(&fs_info->open_ioctl_trans)) - wait_current_trans(fs_info); + wait_current_trans(fs_info); } static int should_end_transaction(struct btrfs_trans_handle *trans) @@ -870,8 +868,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_chunk_metadata(trans); - if (lock && !atomic_read(&info->open_ioctl_trans) && - should_end_transaction(trans) && + if (lock && should_end_transaction(trans) && READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { spin_lock(&info->trans_lock); if (cur_trans->state == TRANS_STATE_RUNNING) From 62b8e077317971b74d79c9cc501db2ae861d48b2 Mon Sep 17 00:00:00 2001 From: Howard McLauchlan Date: Thu, 8 Mar 2018 10:48:48 -0800 Subject: [PATCH 091/164] btrfs: Add nossd_spread mount option Btrfs has two mount options for SSD optimizations: ssd and ssd_spread. Presently there is an option to disable all SSD optimizations, but there isn't an option to disable just ssd_spread. This patch adds a mount option nossd_spread that disables ssd_spread only.
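As a rough illustration of the intended semantics, here is a minimal userspace sketch of the nossd/nossd_spread fallthrough; the flag bits, token names and helper are invented stand-ins for what btrfs keeps in fs_info->mount_opt, not the actual parser code:

/* Simplified sketch only: flag values and helpers are made up. */
#include <stdio.h>

#define OPT_SSD        (1u << 0)
#define OPT_SSD_SPREAD (1u << 1)
#define OPT_NOSSD      (1u << 2)

enum { Opt_nossd, Opt_nossd_spread };

static unsigned int handle_ssd_tokens(unsigned int mount_opt, int token)
{
        switch (token) {
        case Opt_nossd:
                /* nossd disables all SSD optimizations ... */
                mount_opt |= OPT_NOSSD;
                mount_opt &= ~OPT_SSD;
                /* fallthrough */
        case Opt_nossd_spread:
                /* ... while nossd_spread only clears the spread scheme */
                mount_opt &= ~OPT_SSD_SPREAD;
                break;
        }
        return mount_opt;
}

int main(void)
{
        unsigned int opts = OPT_SSD | OPT_SSD_SPREAD;

        printf("after nossd_spread: %#x\n", handle_ssd_tokens(opts, Opt_nossd_spread));
        printf("after nossd:        %#x\n", handle_ssd_tokens(opts, Opt_nossd));
        return 0;
}

With the fallthrough, nossd keeps its old meaning of switching every SSD optimization off, while nossd_spread only drops the spread allocation scheme and leaves plain ssd behaviour alone.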
Reviewed-by: Josef Bacik Signed-off-by: Howard McLauchlan Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/super.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index defaccde8d16..121b9d40ff8f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -310,10 +310,10 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, - Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_type, Opt_compress_force, Opt_compress_force_type, - Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, + Opt_nossd, Opt_ssd_spread, Opt_nossd_spread, Opt_thread_pool, Opt_noacl, + Opt_compress, Opt_compress_type, Opt_compress_force, + Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_discard, Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, @@ -353,6 +353,7 @@ static const match_table_t tokens = { {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, + {Opt_nossd_spread, "nossd_spread"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, @@ -579,6 +580,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, NOSSD); btrfs_clear_and_info(info, SSD, "not using ssd optimizations"); + /* Fallthrough */ + case Opt_nossd_spread: btrfs_clear_and_info(info, SSD_SPREAD, "not using spread ssd allocation scheme"); break; From 416a72022e43b4c1c5e144ec6af3f95870e8f71a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Mar 2018 14:37:01 +0100 Subject: [PATCH 092/164] btrfs: sort and group mount option definitions Sort mount options by the primary name, followed by the 'no-' counterpart if it exists. Group the deprecated and debugging options. Enum and token definitions are synced.
Signed-off-by: David Sterba --- fs/btrfs/super.c | 143 +++++++++++++++++++++++++++++------------------ 1 file changed, 88 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 121b9d40ff8f..d41d5960ef4a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -308,21 +308,50 @@ static void btrfs_put_super(struct super_block *sb) } enum { - Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, - Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, - Opt_nossd, Opt_ssd_spread, Opt_nossd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_compress_type, Opt_compress_force, - Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, - Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, - Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, - Opt_skip_balance, Opt_check_integrity, + Opt_acl, Opt_noacl, + Opt_clear_cache, + Opt_commit_interval, + Opt_compress, + Opt_compress_force, + Opt_compress_force_type, + Opt_compress_type, + Opt_degraded, + Opt_device, + Opt_fatal_errors, + Opt_flushoncommit, Opt_noflushoncommit, + Opt_inode_cache, Opt_noinode_cache, + Opt_max_inline, + Opt_barrier, Opt_nobarrier, + Opt_datacow, Opt_nodatacow, + Opt_datasum, Opt_nodatasum, + Opt_defrag, Opt_nodefrag, + Opt_discard, Opt_nodiscard, + Opt_nologreplay, + Opt_norecovery, + Opt_ratio, + Opt_rescan_uuid_tree, + Opt_skip_balance, + Opt_space_cache, Opt_no_space_cache, + Opt_space_cache_version, + Opt_ssd, Opt_nossd, + Opt_ssd_spread, Opt_nossd_spread, + Opt_subvol, + Opt_subvolid, + Opt_thread_pool, + Opt_treelog, Opt_notreelog, + Opt_usebackuproot, + Opt_user_subvol_rm_allowed, + + /* Deprecated options */ + Opt_alloc_start, + Opt_recovery, + Opt_subvolrootid, + + /* Debugging options */ + Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, - Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, - Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, - Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot, - Opt_nologreplay, Opt_norecovery, + Opt_check_integrity_print_mask, + Opt_enospc_debug, Opt_noenospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif @@ -333,59 +362,63 @@ enum { }; static const match_table_t tokens = { - {Opt_degraded, "degraded"}, - {Opt_subvol, "subvol=%s"}, - {Opt_subvolid, "subvolid=%s"}, - {Opt_device, "device=%s"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_datasum, "datasum"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_datacow, "datacow"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_barrier, "barrier"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_alloc_start, "alloc_start=%s"}, /* deprecated */ - {Opt_thread_pool, "thread_pool=%u"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_commit_interval, "commit=%u"}, {Opt_compress, "compress"}, {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, {Opt_compress_force_type, "compress-force=%s"}, - {Opt_ssd, "ssd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd, "nossd"}, - {Opt_nossd_spread, "nossd_spread"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_notreelog, "notreelog"}, - {Opt_treelog, "treelog"}, - {Opt_nologreplay, "nologreplay"}, - {Opt_norecovery, "norecovery"}, + {Opt_degraded, "degraded"}, + {Opt_device, "device=%s"}, + {Opt_fatal_errors, 
"fatal_errors=%s"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_noflushoncommit, "noflushoncommit"}, - {Opt_ratio, "metadata_ratio=%u"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_space_cache, "space_cache"}, - {Opt_space_cache_version, "space_cache=%s"}, - {Opt_clear_cache, "clear_cache"}, - {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, - {Opt_enospc_debug, "enospc_debug"}, - {Opt_noenospc_debug, "noenospc_debug"}, - {Opt_subvolrootid, "subvolrootid=%d"}, /* deprecated */ - {Opt_defrag, "autodefrag"}, - {Opt_nodefrag, "noautodefrag"}, {Opt_inode_cache, "inode_cache"}, {Opt_noinode_cache, "noinode_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_recovery, "recovery"}, /* deprecated */ - {Opt_usebackuproot, "usebackuproot"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_datacow, "datacow"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_datasum, "datasum"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_defrag, "autodefrag"}, + {Opt_nodefrag, "noautodefrag"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_nologreplay, "nologreplay"}, + {Opt_norecovery, "norecovery"}, + {Opt_ratio, "metadata_ratio=%u"}, + {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_skip_balance, "skip_balance"}, + {Opt_space_cache, "space_cache"}, + {Opt_no_space_cache, "nospace_cache"}, + {Opt_space_cache_version, "space_cache=%s"}, + {Opt_ssd, "ssd"}, + {Opt_nossd, "nossd"}, + {Opt_ssd_spread, "ssd_spread"}, + {Opt_nossd_spread, "nossd_spread"}, + {Opt_subvol, "subvol=%s"}, + {Opt_subvolid, "subvolid=%s"}, + {Opt_thread_pool, "thread_pool=%u"}, + {Opt_treelog, "treelog"}, + {Opt_notreelog, "notreelog"}, + {Opt_usebackuproot, "usebackuproot"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + + /* Deprecated options */ + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_recovery, "recovery"}, + {Opt_subvolrootid, "subvolrootid=%d"}, + + /* Debugging options */ {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, - {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_commit_interval, "commit=%u"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_noenospc_debug, "noenospc_debug"}, #ifdef CONFIG_BTRFS_DEBUG {Opt_fragment_data, "fragment=data"}, {Opt_fragment_metadata, "fragment=metadata"}, From 6ca1765b366e3a678e143de0decc3d1d39c15429 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 7 Mar 2018 12:08:09 -0700 Subject: [PATCH 093/164] Btrfs: scrub: batch rebuild for raid56 In case of raid56, writes and rebuilds always take BTRFS_STRIPE_LEN(64K) as unit, however, scrub_extent() sets blocksize as unit, so rebuild process may be triggered on every block on a same stripe. A typical example would be that when we're replacing a disappeared disk, all reads on the disks get -EIO, every block (size is 4K if blocksize is 4K) would go thru these, scrub_handle_errored_block scrub_recheck_block # re-read pages one by one scrub_recheck_block # rebuild by calling raid56_parity_recover() page by page Although with raid56 stripe cache most of reads during rebuild can be avoided, the parity recover calculation(xor or raid6 algorithms) needs to be done $(BTRFS_STRIPE_LEN / blocksize) times. This makes it smarter by doing raid56 scrub/replace on stripe length. 
Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 79 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4c9b30903b0..629313732521 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1727,6 +1727,45 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return blk_status_to_errno(bio->bi_status); } +static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, + struct scrub_block *sblock) +{ + struct scrub_page *first_page = sblock->pagev[0]; + struct bio *bio; + int page_num; + + /* All pages in sblock belong to the same stripe on the same device. */ + ASSERT(first_page->dev); + if (!first_page->dev->bdev) + goto out; + + bio = btrfs_io_bio_alloc(BIO_MAX_PAGES); + bio_set_dev(bio, first_page->dev->bdev); + + for (page_num = 0; page_num < sblock->page_count; page_num++) { + struct scrub_page *page = sblock->pagev[page_num]; + + WARN_ON(!page->page); + bio_add_page(bio, page->page, PAGE_SIZE, 0); + } + + if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { + bio_put(bio); + goto out; + } + + bio_put(bio); + + scrub_recheck_block_checksum(sblock); + + return; +out: + for (page_num = 0; page_num < sblock->page_count; page_num++) + sblock->pagev[page_num]->io_error = 1; + + sblock->no_io_error_seen = 0; +} + /* * this function will check the on disk data for checksum errors, header * errors and read I/O errors. If any I/O errors happen, the exact pages @@ -1742,6 +1781,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, sblock->no_io_error_seen = 1; + /* short cut for raid56 */ + if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) + return scrub_recheck_block_on_raid56(fs_info, sblock); + for (page_num = 0; page_num < sblock->page_count; page_num++) { struct bio *bio; struct scrub_page *page = sblock->pagev[page_num]; @@ -1757,19 +1800,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, bio_set_dev(bio, page->dev->bdev); bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { - if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - } - } else { - bio->bi_iter.bi_sector = page->physical >> 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_iter.bi_sector = page->physical >> 9; + bio->bi_opf = REQ_OP_READ; - if (btrfsic_submit_bio_wait(bio)) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - } + if (btrfsic_submit_bio_wait(bio)) { + page->io_error = 1; + sblock->no_io_error_seen = 0; } bio_put(bio); @@ -2737,7 +2773,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) } /* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, +static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, + u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u64 physical_for_dev_replace) { @@ -2746,13 +2783,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, u32 blocksize; if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->fs_info->sectorsize; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + blocksize = map->stripe_len; + else + blocksize = sctx->fs_info->sectorsize; spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed++; sctx->stat.data_bytes_scrubbed += len; 
spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->fs_info->nodesize; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + blocksize = map->stripe_len; + else + blocksize = sctx->fs_info->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; sctx->stat.tree_bytes_scrubbed += len; @@ -2892,9 +2935,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, } if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sctx->fs_info->sectorsize; + blocksize = sparity->stripe_len; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - blocksize = sctx->fs_info->nodesize; + blocksize = sparity->stripe_len; } else { blocksize = sctx->fs_info->sectorsize; WARN_ON(1); @@ -3604,7 +3647,7 @@ again: if (ret) goto out; - ret = scrub_extent(sctx, extent_logical, extent_len, + ret = scrub_extent(sctx, map, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, extent_logical - logical + physical); From 5c2b1fd7531d837bb9a6079d4e730a889b0e303d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Jun 2017 19:22:55 +0200 Subject: [PATCH 094/164] btrfs: assume that bio_ret is always valid in submit_extent_page All callers pass a valid pointer so we can drop the redundant checks. The call to submit_one_bio never happened and can be removed. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index da46e9372262..cf6253bc5126 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2758,6 +2758,7 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, /* * @opf: bio REQ_OP_* and REQ_* flags as one value + * @bio_ret: must be valid pointer, newly allocated bio will be stored there */ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, @@ -2778,7 +2779,9 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, size_t page_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; - if (bio_ret && *bio_ret) { + ASSERT(bio_ret); + + if (*bio_ret) { bio = *bio_ret; if (old_compressed) contig = bio->bi_iter.bi_sector == sector; @@ -2813,10 +2816,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, wbc_account_io(wbc, page, page_size); } - if (bio_ret) - *bio_ret = bio; - else - ret = submit_one_bio(bio, mirror_num, bio_flags); + *bio_ret = bio; return ret; } From 8eec8296a086cf9949dc4b91494fc944778ba5a0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 6 Jun 2017 19:50:13 +0200 Subject: [PATCH 095/164] btrfs: remove redundant variable in __do_readpage The value of page_end is only stored to end, no other use.
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf6253bc5126..079489892e46 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2886,8 +2886,7 @@ static int __do_readpage(struct extent_io_tree *tree, { struct inode *inode = page->mapping->host; u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; - u64 end; + const u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; u64 last_byte = i_size_read(inode); @@ -2905,7 +2904,6 @@ static int __do_readpage(struct extent_io_tree *tree, set_page_extent_mapped(page); - end = page_end; if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); From 0c8508a6e7417e1bee3cb11a681184d907909549 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 12 Jun 2017 20:00:43 +0200 Subject: [PATCH 096/164] btrfs: cleanup merging conditions in submit_extent_page The merge call was factored out to a separate helper but it's a trivial one and arguably we can opencode it and cache the value. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 079489892e46..fecf1e5c4f97 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2744,18 +2744,6 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, return blk_status_to_errno(ret); } -static int merge_bio(struct extent_io_tree *tree, struct page *page, - unsigned long offset, size_t size, struct bio *bio, - unsigned long bio_flags) -{ - int ret = 0; - if (tree->ops) - ret = tree->ops->merge_bio_hook(page, offset, size, bio, - bio_flags); - return ret; - -} - /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @bio_ret: must be valid pointer, newly allocated bio will be stored there @@ -2774,23 +2762,27 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, { int ret = 0; struct bio *bio; - int contig = 0; - int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_SIZE); sector_t sector = offset >> 9; ASSERT(bio_ret); if (*bio_ret) { + bool contig; + bool can_merge = true; + bio = *bio_ret; - if (old_compressed) + if (prev_bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; - if (prev_bio_flags != bio_flags || !contig || + if (tree->ops && tree->ops->merge_bio_hook(page, offset, + page_size, bio, bio_flags)) + can_merge = false; + + if (prev_bio_flags != bio_flags || !contig || !can_merge || force_bio_submit || - merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, pg_offset) < page_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { From b8b3d625ce67e4d4b994ae8d09308b3ab4f651d8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 12 Jun 2017 19:50:41 +0200 Subject: [PATCH 097/164] btrfs: document more parameters of submit_extent_page Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fecf1e5c4f97..f27bad003f8e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2746,7 +2746,19 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, /* * @opf: bio REQ_OP_* and REQ_* flags as one value + * 
@tree: tree so we can call our merge_bio hook + * @wbc: optional writeback control for io accounting + * @page: page to add to the bio + * @pg_offset: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @size: portion of page that we want to write + * @offset: starting offset in the page + * @bdev: attach newly created bios to this bdev * @bio_ret: must be valid pointer, newly allocated bio will be stored there + * @end_io_func: end_io callback for new bio + * @mirror_num: desired mirror to read/write + * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @bio_flags: flags of the current bio to see if we can merge them */ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, From 3ac6de1abd7a485f48948af9ef18139500eef9ae Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:00:37 +0100 Subject: [PATCH 098/164] btrfs: drop fs_info parameter from tree_mod_log_set_node_key It's provided by the extent_buffer. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a80fcd285b34..62aa776c77d1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -877,13 +877,12 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, BUG_ON(ret < 0); } -static noinline void -tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, int atomic) +static noinline void tree_mod_log_set_node_key(struct extent_buffer *eb, + int slot, int atomic) { int ret; - ret = tree_mod_log_insert_key(fs_info, eb, slot, + ret = tree_mod_log_insert_key(eb->fs_info, eb, slot, MOD_LOG_KEY_REPLACE, atomic ? 
GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); @@ -2007,8 +2006,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); - tree_mod_log_set_node_key(fs_info, parent, - pslot + 1, 0); + tree_mod_log_set_node_key(parent, pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -2052,7 +2050,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(fs_info, parent, pslot, 0); + tree_mod_log_set_node_key(parent, pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -2153,7 +2151,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - tree_mod_log_set_node_key(fs_info, parent, pslot, 0); + tree_mod_log_set_node_key(parent, pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -2207,8 +2205,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); - tree_mod_log_set_node_key(fs_info, parent, - pslot + 1, 0); + tree_mod_log_set_node_key(parent, pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -3167,7 +3164,7 @@ static void fixup_low_keys(struct btrfs_fs_info *fs_info, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(fs_info, t, tslot, 1); + tree_mod_log_set_node_key(t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) From 6074d45f60760f7a8aa34883f37cd4d54cd5855e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:03:52 +0100 Subject: [PATCH 099/164] btrfs: drop fs_info parameter from tree_mod_log_insert_move It's provided by the extent_buffer. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 62aa776c77d1..0358ed3e1562 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -564,10 +564,8 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, return ret; } -static noinline int -tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items) +static noinline int tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, int nr_items) { struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; @@ -575,7 +573,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, int i; int locked = 0; - if (!tree_mod_need_log(fs_info, eb)) + if (!tree_mod_need_log(eb->fs_info, eb)) return 0; tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); @@ -603,7 +601,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, } } - if (tree_mod_dont_log(fs_info, eb)) + if (tree_mod_dont_log(eb->fs_info, eb)) goto free_tms; locked = 1; @@ -613,26 +611,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, * buffer, i.e. dst_slot < src_slot. 
*/ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert(fs_info, tm_list[i]); + ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]); if (ret) goto free_tms; } - ret = __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(eb->fs_info, tm); if (ret) goto free_tms; - tree_mod_log_write_unlock(fs_info); + tree_mod_log_write_unlock(eb->fs_info); kfree(tm_list); return 0; free_tms: for (i = 0; i < nr_items; i++) { if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + tree_mod_log_write_unlock(eb->fs_info); kfree(tm_list); kfree(tm); @@ -872,8 +870,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, int dst_offset, int src_offset, int nr_items) { int ret; - ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items); + ret = tree_mod_log_insert_move(dst, dst_offset, src_offset, nr_items); BUG_ON(ret < 0); } From e09c2efe7eba1498bd3f9300b4aadef9b939a259 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:09:03 +0100 Subject: [PATCH 100/164] btrfs: drop fs_info parameter from tree_mod_log_insert_key It's provided by the extent_buffer. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0358ed3e1562..960d2c1008bf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -536,28 +536,26 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot, return tm; } -static noinline int -tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { struct tree_mod_elem *tm; int ret; - if (!tree_mod_need_log(fs_info, eb)) + if (!tree_mod_need_log(eb->fs_info, eb)) return 0; tm = alloc_tree_mod_elem(eb, slot, op, flags); if (!tm) return -ENOMEM; - if (tree_mod_dont_log(fs_info, eb)) { + if (tree_mod_dont_log(eb->fs_info, eb)) { kfree(tm); return 0; } - ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + ret = __tree_mod_log_insert(eb->fs_info, tm); + tree_mod_log_write_unlock(eb->fs_info); if (ret) kfree(tm); @@ -879,8 +877,7 @@ static noinline void tree_mod_log_set_node_key(struct extent_buffer *eb, { int ret; - ret = tree_mod_log_insert_key(eb->fs_info, eb, slot, - MOD_LOG_KEY_REPLACE, + ret = tree_mod_log_insert_key(eb, slot, MOD_LOG_KEY_REPLACE, atomic ? 
GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); } @@ -1178,7 +1175,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - tree_mod_log_insert_key(fs_info, parent, parent_slot, + tree_mod_log_insert_key(parent, parent_slot, MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); @@ -3441,8 +3438,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { - ret = tree_mod_log_insert_key(fs_info, lower, slot, - MOD_LOG_KEY_ADD, GFP_NOFS); + ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD, + GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -4914,8 +4911,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { - ret = tree_mod_log_insert_key(fs_info, parent, slot, - MOD_LOG_KEY_REMOVE, GFP_NOFS); + ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE, + GFP_NOFS); BUG_ON(ret < 0); } From db7279a20b0982587729cb2a23780e42d536721b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:14:25 +0100 Subject: [PATCH 101/164] btrfs: drop fs_info parameter from tree_mod_log_free_eb It's provided by the extent_buffer. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 960d2c1008bf..b3c4305b5ecd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -41,8 +41,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb); struct btrfs_path *btrfs_alloc_path(void) { @@ -882,8 +880,7 @@ static noinline void tree_mod_log_set_node_key(struct extent_buffer *eb, BUG_ON(ret < 0); } -static noinline int -tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static noinline int tree_mod_log_free_eb(struct extent_buffer *eb) { struct tree_mod_elem **tm_list = NULL; int nritems = 0; @@ -893,7 +890,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) if (btrfs_header_level(eb) == 0) return 0; - if (!tree_mod_need_log(fs_info, NULL)) + if (!tree_mod_need_log(eb->fs_info, NULL)) return 0; nritems = btrfs_header_nritems(eb); @@ -910,11 +907,11 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) } } - if (tree_mod_dont_log(fs_info, eb)) + if (tree_mod_dont_log(eb->fs_info, eb)) goto free_tms; - ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); - tree_mod_log_write_unlock(fs_info); + ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); + tree_mod_log_write_unlock(eb->fs_info); if (ret) goto free_tms; kfree(tm_list); @@ -1183,7 +1180,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); if (last_ref) { - ret = tree_mod_log_free_eb(fs_info, buf); + ret = tree_mod_log_free_eb(buf); if (ret) { btrfs_abort_transaction(trans, ret); return ret; From 95b757c164820d7c7262c1e546b47d79dd256c96 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:22:30 +0100 Subject: [PATCH 102/164] btrfs: drop fs_info parameter from tree_mod_log_free_eb It's provided 
by the extent_buffer. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b3c4305b5ecd..217e672bafb4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -654,12 +654,10 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, return 0; } -static noinline int -tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, - struct extent_buffer *old_root, - struct extent_buffer *new_root, - int log_removal) +static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, int log_removal) { + struct btrfs_fs_info *fs_info = old_root->fs_info; struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; int nritems = 0; @@ -932,8 +930,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root, int log_removal) { int ret; - ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, log_removal); + ret = tree_mod_log_insert_root(root->node, new_root_node, log_removal); BUG_ON(ret < 0); } From a446a979ff4ec6784b59583b929e4656fe2f8e32 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:26:29 +0100 Subject: [PATCH 103/164] btrfs: drop unused fs_info parameter from tree_mod_log_eb_move Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 217e672bafb4..53dfce12e3fb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -859,8 +859,7 @@ free_tms: return ret; } -static inline void -tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, +static inline void tree_mod_log_eb_move(struct extent_buffer *dst, int dst_offset, int src_offset, int nr_items) { int ret; @@ -3305,7 +3304,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; - tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems); + tree_mod_log_eb_move(dst, push_items, 0, dst_nritems); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * @@ -3424,8 +3423,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info)); if (slot != nritems) { if (level) - tree_mod_log_eb_move(fs_info, lower, slot + 1, - slot, nritems - slot); + tree_mod_log_eb_move(lower, slot + 1, slot, + nritems - slot); memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), @@ -4897,8 +4896,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (level) - tree_mod_log_eb_move(fs_info, parent, slot, - slot + 1, nritems - slot - 1); + tree_mod_log_eb_move(parent, slot, slot + 1, + nritems - slot - 1); memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), From b6dfa35bd56762581bfc60b7eb9b1a4d7d10c289 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:31:18 +0100 Subject: [PATCH 104/164] btrfs: embed tree_mod_move structure to tree_mod_elem The tree_mod_move is not used anywhere and can be embedded as anonymous structure. 
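The accessors stay exactly the same (tm->move.dst_slot, tm->move.nr_items); only the standalone type name goes away. A small sketch of the pattern, with shortened names and made-up values rather than the kernel definition:

/* Sketch of embedding a single-use struct as an anonymous member. */
#include <stdio.h>

struct mod_elem {
        unsigned long long logical;
        /* was: struct tree_mod_move move; -- now the struct type has no tag */
        struct {
                int dst_slot;
                int nr_items;
        } move;
};

int main(void)
{
        struct mod_elem tm = {
                .logical = 4096,
                .move = { .dst_slot = 2, .nr_items = 3 },
        };

        /* member access is unchanged by the refactoring */
        printf("move %d items to slot %d\n", tm.move.nr_items, tm.move.dst_slot);
        return 0;
}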
Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 53dfce12e3fb..6b0192ebf01f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -299,11 +299,6 @@ enum mod_log_op { MOD_LOG_ROOT_REPLACE, }; -struct tree_mod_move { - int dst_slot; - int nr_items; -}; - struct tree_mod_root { u64 logical; u8 level; @@ -326,7 +321,10 @@ struct tree_mod_elem { u64 blockptr; /* this is used for op == MOD_LOG_MOVE_KEYS */ - struct tree_mod_move move; + struct { + int dst_slot; + int nr_items; + } move; /* this is used for op == MOD_LOG_ROOT_REPLACE */ struct tree_mod_root old_root; From bcd24dabe080a627333fb058fdfe760dc579edf1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:33:18 +0100 Subject: [PATCH 105/164] btrfs: drop fs_info parameter from __tree_mod_log_oldest_root It's provided by the extent_buffer. Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6b0192ebf01f..dbd0f976e59d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1195,9 +1195,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, * returns the logical address of the oldest predecessor of the given root. * entries older than time_seq are ignored. */ -static struct tree_mod_elem * -__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb_root, u64 time_seq) +static struct tree_mod_elem *__tree_mod_log_oldest_root( + struct extent_buffer *eb_root, u64 time_seq) { struct tree_mod_elem *tm; struct tree_mod_elem *found = NULL; @@ -1214,7 +1213,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, * first operation that's logged for this root. */ while (1) { - tm = tree_mod_log_search_oldest(fs_info, root_logical, + tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, time_seq); if (!looped && !tm) return NULL; @@ -1404,7 +1403,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) u64 logical; eb_root = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq); + tm = __tree_mod_log_oldest_root(eb_root, time_seq); if (!tm) return eb_root; @@ -1468,7 +1467,7 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) int level; struct extent_buffer *eb_root = btrfs_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); + tm = __tree_mod_log_oldest_root(eb_root, time_seq); if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { level = tm->old_root.level; } else { From b1a09f1ec540408abf3a50d15dff5d9506932693 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:43:41 +0100 Subject: [PATCH 106/164] btrfs: remove trivial locking wrappers of tree mod log The wrappers are trivial and do not bring any extra value on top of the plain locking primitives. 
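After the cleanup, the read_lock()/write_lock() calls on fs_info->tree_mod_log_lock appear directly at the call sites, which also makes the lock and the locking mode visible there. A userspace analogy using POSIX rwlocks (the kernel uses rwlock_t, and all names here are invented for the sketch):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mod_log_lock = PTHREAD_RWLOCK_INITIALIZER;

/* the removed style: a wrapper that only forwards to the primitive */
static void mod_log_write_lock(void)
{
        pthread_rwlock_wrlock(&mod_log_lock);
}

int main(void)
{
        /* before: the extra name hides which lock is taken and how */
        mod_log_write_lock();
        pthread_rwlock_unlock(&mod_log_lock);

        /* after: the primitive and the lock are spelled out at the call site */
        pthread_rwlock_rdlock(&mod_log_lock);
        puts("searching the tree mod log under the read lock");
        pthread_rwlock_unlock(&mod_log_lock);
        return 0;
}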
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 58 ++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dbd0f976e59d..a1c987a26827 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -330,26 +330,6 @@ struct tree_mod_elem { struct tree_mod_root old_root; }; -static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info) -{ - read_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info) -{ - read_unlock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info) -{ - write_lock(&fs_info->tree_mod_log_lock); -} - -static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) -{ - write_unlock(&fs_info->tree_mod_log_lock); -} - /* * Pull a new tree mod seq number for our operation. */ @@ -369,14 +349,14 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } spin_unlock(&fs_info->tree_mod_seq_lock); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; } @@ -418,7 +398,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); @@ -428,7 +408,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, rb_erase(node, tm_root); kfree(tm); } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); } /* @@ -439,7 +419,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * for root replace operations, or the logical address of the affected * block for all other operations. * - * Note: must be called with write lock (tree_mod_log_write_lock). + * Note: must be called with write lock for fs_info::tree_mod_log_lock. */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -477,7 +457,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it * returns zero with the tree_mod_log_lock acquired. The caller must hold * this until all tree mod log insertions are recorded in the rb tree and then - * call tree_mod_log_write_unlock() to release. + * write unlock fs_info::tree_mod_log_lock. 
*/ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { @@ -487,9 +467,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, if (eb && btrfs_header_level(eb) == 0) return 1; - tree_mod_log_write_lock(fs_info); + write_lock(&fs_info->tree_mod_log_lock); if (list_empty(&(fs_info)->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); return 1; } @@ -551,7 +531,7 @@ static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot, } ret = __tree_mod_log_insert(eb->fs_info, tm); - tree_mod_log_write_unlock(eb->fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) kfree(tm); @@ -613,7 +593,7 @@ static noinline int tree_mod_log_insert_move(struct extent_buffer *eb, ret = __tree_mod_log_insert(eb->fs_info, tm); if (ret) goto free_tms; - tree_mod_log_write_unlock(eb->fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); return 0; @@ -624,7 +604,7 @@ free_tms: kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(eb->fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); kfree(tm_list); kfree(tm); @@ -703,7 +683,7 @@ static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root, if (!ret) ret = __tree_mod_log_insert(fs_info, tm); - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -730,7 +710,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, struct tree_mod_elem *cur = NULL; struct tree_mod_elem *found = NULL; - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; node = tm_root->rb_node; while (node) { @@ -758,7 +738,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, break; } } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); return found; } @@ -839,7 +819,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, goto free_tms; } - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return 0; @@ -851,7 +831,7 @@ free_tms: kfree(tm_list[i]); } if (locked) - tree_mod_log_write_unlock(fs_info); + write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); return ret; @@ -906,7 +886,7 @@ static noinline int tree_mod_log_free_eb(struct extent_buffer *eb) goto free_tms; ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); - tree_mod_log_write_unlock(eb->fs_info); + write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) goto free_tms; kfree(tm_list); @@ -1262,7 +1242,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, unsigned long p_size = sizeof(struct btrfs_key_ptr); n = btrfs_header_nritems(eb); - tree_mod_log_read_lock(fs_info); + read_lock(&fs_info->tree_mod_log_lock); while (tm && tm->seq >= time_seq) { /* * all the operations are recorded with the operator used for @@ -1317,7 +1297,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, if (tm->logical != first_tm->logical) break; } - tree_mod_log_read_unlock(fs_info); + read_unlock(&fs_info->tree_mod_log_lock); btrfs_set_header_nritems(eb, n); } From bf1d342510bdba21b5f0c68057cd9fe6f68d0dc1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 15:47:39 +0100 Subject: [PATCH 107/164] btrfs: kill trivial wrapper tree_mod_log_eb_move The wrapper is effectively an alias for tree_mod_log_insert_move 
but also hides the missing error handling. To make that more visible, lift the BUG_ON to the callers. Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a1c987a26827..1e6a3281befe 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -837,14 +837,6 @@ free_tms: return ret; } -static inline void tree_mod_log_eb_move(struct extent_buffer *dst, - int dst_offset, int src_offset, int nr_items) -{ - int ret; - ret = tree_mod_log_insert_move(dst, dst_offset, src_offset, nr_items); - BUG_ON(ret < 0); -} - static noinline void tree_mod_log_set_node_key(struct extent_buffer *eb, int slot, int atomic) { @@ -3225,8 +3217,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * don't call tree_mod_log_eb_move here, key removal was already - * fully logged by tree_mod_log_eb_copy above. + * Don't call tree_mod_log_insert_move here, key removal was + * already fully logged by tree_mod_log_eb_copy above. */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), @@ -3281,7 +3273,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; - tree_mod_log_eb_move(dst, push_items, 0, dst_nritems); + ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); + BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * @@ -3399,9 +3392,11 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(slot > nritems); BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info)); if (slot != nritems) { - if (level) - tree_mod_log_eb_move(lower, slot + 1, slot, + if (level) { + ret = tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); + BUG_ON(ret < 0); + } memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), btrfs_node_key_ptr_offset(slot), @@ -4872,9 +4867,11 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { - if (level) - tree_mod_log_eb_move(parent, slot, slot + 1, + if (level) { + ret = tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); + BUG_ON(ret < 0); + } memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), btrfs_node_key_ptr_offset(slot + 1), From 0e82bcfe3c5e557ff9262eac59526e5208b84c60 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 16:16:54 +0100 Subject: [PATCH 108/164] btrfs: kill tree_mod_log_set_node_key helper A trivial wrapper that can be simply opencoded and makes the GFP allocation request more visible. The error handling is now moved to the callers. Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1e6a3281befe..260ca86a5cf8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -837,16 +837,6 @@ free_tms: return ret; } -static noinline void tree_mod_log_set_node_key(struct extent_buffer *eb, - int slot, int atomic) -{ - int ret; - - ret = tree_mod_log_insert_key(eb, slot, MOD_LOG_KEY_REPLACE, - atomic ? 
GFP_ATOMIC : GFP_NOFS); - BUG_ON(ret < 0); -} - static noinline int tree_mod_log_free_eb(struct extent_buffer *eb) { struct tree_mod_elem **tm_list = NULL; @@ -1962,7 +1952,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); - tree_mod_log_set_node_key(parent, pslot + 1, 0); + ret = tree_mod_log_insert_key(parent, pslot + 1, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -2006,7 +1998,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(parent, pslot, 0); + ret = tree_mod_log_insert_key(parent, pslot, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -2107,7 +2101,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - tree_mod_log_set_node_key(parent, pslot, 0); + ret = tree_mod_log_insert_key(parent, pslot, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -2161,7 +2157,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); - tree_mod_log_set_node_key(parent, pslot + 1, 0); + ret = tree_mod_log_insert_key(parent, pslot + 1, + MOD_LOG_KEY_REPLACE, GFP_NOFS); + BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -3114,13 +3112,17 @@ static void fixup_low_keys(struct btrfs_fs_info *fs_info, { int i; struct extent_buffer *t; + int ret; for (i = level; i < BTRFS_MAX_LEVEL; i++) { int tslot = path->slots[i]; + if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(t, tslot, 1); + ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE, + GFP_ATOMIC); + BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) From d9d19a010b8b186668ce1182bcf92ab53d11c084 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 5 Mar 2018 16:35:29 +0100 Subject: [PATCH 109/164] btrfs: kill tree_mod_log_set_root_pointer helper A useless wrapper around tree_mod_log_insert_root that hides missing error handling. Move it to the callers. 
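Opencoding the call makes it obvious at the call site that the operation can fail and where that failure is, for now, turned into a crash. A rough userspace sketch of the before/after shape, with assert() standing in for BUG_ON and all names simplified:

#include <assert.h>
#include <stdio.h>

/* stand-in for tree_mod_log_insert_root(), which can fail with -ENOMEM */
static int log_insert_root(int new_root, int log_removal)
{
        printf("log root replace -> %d (removal=%d)\n", new_root, log_removal);
        return 0;
}

/* the removed style: a void helper that swallows the return code */
static void set_root_pointer(int new_root, int log_removal)
{
        int ret = log_insert_root(new_root, log_removal);

        assert(ret >= 0);
}

int main(void)
{
        int ret;

        /* the opencoded style: the caller owns the check */
        ret = log_insert_root(42, 1);
        assert(ret >= 0);

        set_root_pointer(43, 0); /* old style, shown only for contrast */
        return 0;
}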
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 260ca86a5cf8..1ef6b67f893a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -883,16 +883,6 @@ free_tms: return ret; } -static noinline void -tree_mod_log_set_root_pointer(struct btrfs_root *root, - struct extent_buffer *new_root_node, - int log_removal) -{ - int ret; - ret = tree_mod_log_insert_root(root->node, new_root_node, log_removal); - BUG_ON(ret < 0); -} - /* * check if the tree block can be shared by multiple trees */ @@ -1119,7 +1109,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = buf->start; extent_buffer_get(cow); - tree_mod_log_set_root_pointer(root, cow, 1); + ret = tree_mod_log_insert_root(root->node, cow, 1); + BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, @@ -1873,7 +1864,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_set_root_pointer(root, child, 1); + ret = tree_mod_log_insert_root(root->node, child, 1); + BUG_ON(ret < 0); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -3319,6 +3311,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; + int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -3357,7 +3350,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - tree_mod_log_set_root_pointer(root, c, 0); + ret = tree_mod_log_insert_root(root->node, c, 0); + BUG_ON(ret < 0); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ From a758781d4b76c38374f155e2f2cf902e13b9e50e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 23 Jun 2017 03:05:23 +0200 Subject: [PATCH 110/164] btrfs: separate types for submit_bio_start and submit_bio_done The callbacks make use of different parameters that are passed to the other type unnecessarily. This patch adds separate types for each and the unused parameters will be removed. The type extent_submit_bio_hook_t keeps all parameters and can be used where the start/done types are not appropriate. 
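A minimal sketch of the typedef split outside the kernel; the struct and the parameter lists are simplified stand-ins, and the trimmed signatures correspond to what the following two patches arrive at:

#include <stdio.h>

/* catch-all hook that keeps every parameter, like extent_submit_bio_hook_t */
typedef int (submit_bio_hook_t)(void *private_data, const char *bio,
                                int mirror_num, unsigned long bio_flags,
                                unsigned long long bio_offset);

/* dedicated types for the two async callbacks */
typedef int (submit_bio_start_t)(void *private_data, const char *bio,
                                 unsigned long long bio_offset);
typedef int (submit_bio_done_t)(void *private_data, const char *bio,
                                int mirror_num);

struct async_submit {
        submit_bio_start_t *submit_bio_start;
        submit_bio_done_t *submit_bio_done;
};

static int csum_start(void *private_data, const char *bio,
                      unsigned long long bio_offset)
{
        printf("checksum %s at offset %llu\n", bio, bio_offset);
        return 0;
}

static int map_done(void *private_data, const char *bio, int mirror_num)
{
        printf("map %s to mirror %d\n", bio, mirror_num);
        return 0;
}

int main(void)
{
        struct async_submit as = {
                .submit_bio_start = csum_start,
                .submit_bio_done = map_done,
        };

        as.submit_bio_start(NULL, "bio0", 4096);
        as.submit_bio_done(NULL, "bio0", 1);
        return 0;
}

Giving each callback its own type lets the compiler reject an implementation written against the wrong signature, which the shared hook type could not.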
Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/extent_io.h | 9 +++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bbbcbf87ac93..ee5d29a0219f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -124,8 +124,8 @@ struct async_submit_bio { void *private_data; struct btrfs_fs_info *fs_info; struct bio *bio; - extent_submit_bio_hook_t *submit_bio_start; - extent_submit_bio_hook_t *submit_bio_done; + extent_submit_bio_start_t *submit_bio_start; + extent_submit_bio_done_t *submit_bio_done; int mirror_num; unsigned long bio_flags; /* @@ -751,8 +751,8 @@ static void run_one_async_free(struct btrfs_work *work) blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) + extent_submit_bio_start_t *submit_bio_start, + extent_submit_bio_done_t *submit_bio_done) { struct async_submit_bio *async; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index aaf99529883d..70a88d61b547 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -131,8 +131,8 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done); + extent_submit_bio_start_t *submit_bio_start, + extent_submit_bio_done_t *submit_bio_done); int btrfs_write_tree_block(struct extent_buffer *buf); void btrfs_wait_tree_block_writeback(struct extent_buffer *buf); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c82a5842d524..bbfae2abfb39 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -95,6 +95,15 @@ struct io_failure_record; typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset); + +typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset); + +typedef blk_status_t (extent_submit_bio_done_t)(void *private_data, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset); + struct extent_io_ops { /* * The following callbacks must be allways defined, the function From d0779291b1e9666aa4aac46ffd8062e3c3b0f2ab Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Mar 2018 13:47:33 +0100 Subject: [PATCH 111/164] btrfs: remove unused parameters from extent_submit_bio_start_t Remove parameters not used by any of the callbacks. 
Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/extent_io.h | 3 +-- fs/btrfs/inode.c | 4 +--- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ee5d29a0219f..6fe5a959d56b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -717,7 +717,6 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->private_data, async->bio, - async->mirror_num, async->bio_flags, async->bio_offset); if (ret) async->status = ret; @@ -800,7 +799,6 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) } static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, u64 bio_offset) { /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bbfae2abfb39..6596b697b827 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -97,8 +97,7 @@ typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio * u64 bio_offset); typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, - struct bio *bio, int mirror_num, unsigned long bio_flags, - u64 bio_offset); + struct bio *bio, u64 bio_offset); typedef blk_status_t (extent_submit_bio_done_t)(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fc5b7d82b842..08ae94415ed4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1942,7 +1942,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * are inserted into the btree */ static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct inode *inode = private_data; @@ -8222,8 +8221,7 @@ static void btrfs_endio_direct_write(struct bio *bio) } static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 offset) + struct bio *bio, u64 offset) { struct inode *inode = private_data; blk_status_t ret; From 6c553435870bf351c594437b4ba8babbdb0bb37e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Mar 2018 13:47:33 +0100 Subject: [PATCH 112/164] btrfs: remove unused parameters from extent_submit_bio_done_t Remove parameters not used by any of the callbacks. 
Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++---- fs/btrfs/extent_io.h | 3 +-- fs/btrfs/inode.c | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6fe5a959d56b..7cec8a003838 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -735,8 +735,7 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->private_data, async->bio, async->mirror_num, - async->bio_flags, async->bio_offset); + async->submit_bio_done(async->private_data, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -809,8 +808,7 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio } static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) + int mirror_num) { struct inode *inode = private_data; blk_status_t ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6596b697b827..b77d84909863 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -100,8 +100,7 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); typedef blk_status_t (extent_submit_bio_done_t)(void *private_data, - struct bio *bio, int mirror_num, unsigned long bio_flags, - u64 bio_offset); + struct bio *bio, int mirror_num); struct extent_io_ops { /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 08ae94415ed4..a03712d496d6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1961,8 +1961,7 @@ static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio * are inserted into the btree */ static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) + int mirror_num) { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); From d0ee39349311ce823c36ece7fa066827ab8eb7af Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 8 Mar 2018 14:35:48 +0100 Subject: [PATCH 113/164] btrfs: rename submit callbacks and drop double underscores Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/inode.c | 23 +++++++++++------------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7cec8a003838..156116655a32 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -797,7 +797,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio, +static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { /* @@ -807,7 +807,7 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio return btree_csum_one_bio(bio); } -static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, +static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio, int mirror_num) { struct inode *inode = private_data; @@ -867,8 +867,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, bio_offset, private_data, - __btree_submit_bio_start, - __btree_submit_bio_done); + btree_submit_bio_start, + btree_submit_bio_done); } if (ret) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a03712d496d6..6c08c03fc03c 100644 --- a/fs/btrfs/inode.c +++ 
b/fs/btrfs/inode.c @@ -1941,7 +1941,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio, +static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { struct inode *inode = private_data; @@ -1960,7 +1960,7 @@ static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, +static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num) { struct inode *inode = private_data; @@ -2033,8 +2033,8 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags, bio_offset, inode, - __btrfs_submit_bio_start, - __btrfs_submit_bio_done); + btrfs_submit_bio_start, + btrfs_submit_bio_done); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, 0, 0); @@ -8219,7 +8219,7 @@ static void btrfs_endio_direct_write(struct bio *bio) bio_put(bio); } -static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data, +static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { struct inode *inode = private_data; @@ -8300,9 +8300,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, return 0; } -static inline blk_status_t -__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, - int async_submit) +static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, + struct inode *inode, u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; @@ -8325,8 +8324,8 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, if (write && async_submit) { ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0, file_offset, inode, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); + btrfs_submit_bio_start_direct_io, + btrfs_submit_bio_done); goto err; } else if (write) { /* @@ -8412,7 +8411,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) */ atomic_inc(&dip->pending_bios); - status = __btrfs_submit_dio_bio(bio, inode, file_offset, + status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); @@ -8432,7 +8431,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) } while (submit_len > 0); submit: - status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); + status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (!status) return 0; From 776c4a7ce86f4c95d9fc6307d44a4e9e1a740e40 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 13 Mar 2018 10:26:06 +0200 Subject: [PATCH 114/164] btrfs: Use sizeof directly instead of a constant variable The kernel would like to have all stack VLA usage removed[1]. Unfortunately using an integer constant variable as the size of an array is still considered a VLA. Instead let's use directly sizeof(var) which removes the VLA usage. 
Use the occasion to remove csum_size altogether and use sizeof() also for the size passed to memcmp [1]: https://lkml.org/lkml/2018/3/7/621 Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 156116655a32..b203982d66bc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -403,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, if (csum_type == BTRFS_CSUM_TYPE_CRC32) { u32 crc = ~(u32)0; - const int csum_size = sizeof(crc); - char result[csum_size]; + char result[sizeof(crc)]; /* * The super_block structure does not span the whole @@ -415,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); - if (memcmp(raw_disk_sb, result, csum_size)) + if (memcmp(raw_disk_sb, result, sizeof(result))) ret = 1; } From d87ff75863e92a500538ab53318c5740f196631e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 12 Mar 2018 14:48:09 +0200 Subject: [PATCH 115/164] btrfs: Handle error from btrfs_uuid_tree_rem call in _btrfs_ioctl_set_received_subvol As with every function which deals with modifying the btree btrfs_uuid_tree_rem can fail for any number of reasons (ie. EIO/ENOMEM). Handle return error value from this function gracefully by aborting the transaction. Fixes: dd5f9615fc5c ("Btrfs: maintain subvolume items in the UUID tree") Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5011b6272cfa..94bcc1bf71ca 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5048,10 +5048,17 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); if (received_uuid_changed && - !btrfs_is_empty_uuid(root_item->received_uuid)) - btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + !btrfs_is_empty_uuid(root_item->received_uuid)) { + ret = btrfs_uuid_tree_rem(trans, fs_info, + root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out; + } + } memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); btrfs_set_root_stransid(root_item, sa->stransid); btrfs_set_root_rtransid(root_item, sa->rtransid); From 6f47c706d9d4055ecdd125023577a20449c12b24 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 13 Mar 2018 12:22:32 +0200 Subject: [PATCH 116/164] btrfs: Document parameters of btrfs_reserve_extent This function is the entry to the extent allocator and as such has quite a number of parameters. Some of those have subtle effects on the allocation algorithm. Document the parameters. 
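To make the new comment block concrete, here is a hedged sketch of a caller for the common uncompressed-data case; it assumes a resolved root/fs_info in scope, and the variable names, sizes and surrounding error handling are illustrative rather than lifted from the btrfs code:

	struct btrfs_key ins;
	u64 alloc_hint = 0;
	u64 num_bytes = SZ_1M;		/* uncompressed: ram_bytes == num_bytes */
	int ret;

	ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
				   fs_info->sectorsize,	/* min_alloc_size */
				   0,			/* empty_size: no extra COW hint */
				   alloc_hint,		/* hint_byte */
				   &ins, 1 /* is_data */, 1 /* delalloc */);
	if (ret == -ENOSPC) {
		/* per the comment, ins.offset now holds the largest hole found */
		num_bytes = ins.offset;
	} else if (ret == 0) {
		/* ins.objectid is the start of the hole, ins.offset its size */
	}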
Signed-off-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1469117a8e37..0e46142d70f6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7968,6 +7968,51 @@ again: up_read(&info->groups_sem); } +/* + * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a + * hole that is at least as big as @num_bytes. + * + * @root - The root that will contain this extent + * + * @ram_bytes - The amount of space in ram that @num_bytes take. This + * is used for accounting purposes. This value differs + * from @num_bytes only in the case of compressed extents. + * + * @num_bytes - Number of bytes to allocate on-disk. + * + * @min_alloc_size - Indicates the minimum amount of space that the + * allocator should try to satisfy. In some cases + * @num_bytes may be larger than what is required and if + * the filesystem is fragmented then allocation fails. + * However, the presence of @min_alloc_size gives a + * chance to try and satisfy the smaller allocation. + * + * @empty_size - A hint that you plan on doing more COW. This is the + * size in bytes the allocator should try to find free + * next to the block it returns. This is just a hint and + * may be ignored by the allocator. + * + * @hint_byte - Hint to the allocator to start searching above the byte + * address passed. It might be ignored. + * + * @ins - This key is modified to record the found hole. It will + * have the following values: + * ins->objectid == start position + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * + * @is_data - Boolean flag indicating whether an extent is + * allocated for data (true) or metadata (false) + * + * @delalloc - Boolean flag indicating whether this allocation is for + * delalloc or not. If 'true' data_rwsem of block groups + * is going to be acquired. + * + * + * Returns 0 when an allocation succeeded or < 0 when an error occurred. In + * case -ENOSPC is returned then @ins->offset will contain the size of the + * largest available hole the allocator managed to find. + */ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, From ba89b80268c162eca8f23c5d71eb710d65a132fe Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 31 Jan 2018 13:56:15 +0800 Subject: [PATCH 117/164] btrfs: volumes: Remove the meaningless condition of minimal nr_devs when allocating a chunk When checking the minimal nr_devs, there is one dead and meaningless condition: if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This condition is meaningless, @devs_increment has nothing to do with @sub_stripes. In fact, in btrfs_raid_array[], profile with sub_stripes larger than 1 (RAID10) already has the @devs_increment set to 2. So no need to multiple it by @sub_stripes. And above condition is also dead. For RAID10, @devs_increment * @sub_stripes equals 4, which is also the @devs_min of RAID10. For other profiles, @sub_stripes is always 1, and since @ndevs is rounded down to @devs_increment, the condition will always be true. Remove the meaningless condition to make later reader wander less. 
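A worked check of why the dropped half of the condition was dead, using the raid attributes the message refers to (the RAID10 values are spelled out above; the general statement about the other profiles reflects btrfs_raid_array as of this series and should be read as illustrative):

  RAID10:  devs_increment * sub_stripes = 2 * 2 = 4 = devs_min
           -> the two halves of the old test reject exactly the same ndevs.
  others:  sub_stripes = 1 and devs_increment <= devs_min
           -> ndevs < devs_increment already implies ndevs < devs_min,
              so the first half can never fire on its own.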
Signed-off-by: Qu Wenruo Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1e72357bdfa8..dcc2b486b2c5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4837,13 +4837,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, /* round down to number of usable stripes */ ndevs = round_down(ndevs, devs_increment); - if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + if (ndevs < devs_min) { ret = -ENOSPC; if (btrfs_test_opt(info, ENOSPC_DEBUG)) { btrfs_debug(info, "%s: not enough devices with free space: have=%d minimum required=%d", - __func__, ndevs, min(devs_min, - devs_increment * sub_stripes)); + __func__, ndevs, devs_min); } goto error; } From a6dbceafb915e86d1a34d8d61403c59e01c2875c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 14:36:22 +0200 Subject: [PATCH 118/164] btrfs: Remove unused op_key var from add_delayed_refs Added as part of 86d5f9944252 ("btrfs: convert prelimary reference tracking to use rbtrees") but never used. tmp_op_key essentially subsumed that variable. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/backref.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 4a33448cbb01..6007dd6b799e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -773,15 +773,12 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; struct btrfs_key tmp_op_key; - struct btrfs_key *op_key = NULL; struct rb_node *n; int count; int ret = 0; - if (extent_op && extent_op->update_key) { + if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); - op_key = &tmp_op_key; - } spin_lock(&head->lock); for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { From d6e823a578970381bec576d376eea16dac9868a1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 14:36:23 +0200 Subject: [PATCH 119/164] btrfs: Remove unused length var from scrub_handle_errored_block Added in b5d67f64f9bc ("Btrfs: change scrub to support big blocks") but rendered redundant by be50a8ddaae1 ("Btrfs: Simplify scrub_setup_recheck_block()'s argument"). 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 629313732521..4ab4a68dbc06 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1111,7 +1111,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) struct scrub_ctx *sctx = sblock_to_check->sctx; struct btrfs_device *dev; struct btrfs_fs_info *fs_info; - u64 length; u64 logical; unsigned int failed_mirror_index; unsigned int is_metadata; @@ -1139,7 +1138,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); return 0; } - length = sblock_to_check->page_count * PAGE_SIZE; logical = sblock_to_check->pagev[0]->logical; BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; From 4eeb97c67a90a2193cb622f4df8076cba7c42023 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 14:36:24 +0200 Subject: [PATCH 120/164] btrfs: Remove unused tot_len var from lzo_decompress Added already unused in a6fa6fae40ec ("btrfs: Add lzo compression support"). Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 6c7f18cd3b61..1c7f7f70caf4 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -382,14 +382,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); size_t in_len; size_t out_len; - size_t tot_len; int ret = 0; char *kaddr; unsigned long bytes; BUG_ON(srclen < LZO_LEN); - tot_len = read_compress_length(data_in); data_in += LZO_LEN; in_len = read_compress_length(data_in); From 8535dc196769b85ac6cc93f1e06986a57cb08e3d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 14:36:25 +0200 Subject: [PATCH 121/164] btrfs: Remove unused root var from relink_file_extents Added in 38c227d87c49 ("Btrfs: snapshot-aware defrag") but subsequently made redundant by 0b246afa62b0 ("btrfs: root->fs_info cleanup, add fs_info convenience variables"). Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6c08c03fc03c..1e9cceeaf4c4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2772,12 +2772,10 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; struct inode *inode; - struct btrfs_root *root; struct rb_node *node; int ret; inode = new->inode; - root = BTRFS_I(inode)->root; path = btrfs_alloc_path(); if (!path) From 338dae1ae64f413bc058d0f2e964480add5ddacc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 14:36:26 +0200 Subject: [PATCH 122/164] btrfs: remove max_active var from open_ctree Introduced by 5cdc7ad337fb ("btrfs: Replace fs_info->workers with btrfs_workqueue.") but obsoleted by 2a4581983f90 ("btrfs: factor btrfs_init_workqueues() out of open_ctree()"). 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b203982d66bc..04834cb65272 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2389,7 +2389,6 @@ int open_ctree(struct super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - u32 max_active; int clear_free_space_tree = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2698,8 +2697,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - max_active = fs_info->thread_pool_size; - ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; From 101d2dc0b2a976a9979a247eb2cf4d1e4e9ec341 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 14:36:28 +0200 Subject: [PATCH 123/164] btrfs: Remove unused extent_root var from caching_thread Added by b4570aa994b8 ("btrfs: fix compiling with CONFIG_BTRFS_DEBUG enabled.") and obsoleted by 2ff7e61e0d30 ("btrfs: take an fs_info directly when the root is not used otherwise"). Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0e46142d70f6..9f4e6330d7c5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work) struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; struct btrfs_caching_control *caching_ctl; - struct btrfs_root *extent_root; int ret; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - extent_root = fs_info->extent_root; mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); From 39d7d09dc2d60a42c70f78ebf9e56ed123a9050e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 14:36:29 +0200 Subject: [PATCH 124/164] btrfs: Remove unused flush var in shrink_delalloc Added by 08e007d2e577 ("Btrfs: improve the noflush reservation") and made redundant by 17024ad0a0fd ("Btrfs: fix early ENOSPC due to delalloc"). Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f4e6330d7c5..63c679e9b541 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4829,7 +4829,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, long time_left; unsigned long nr_pages; int loops; - enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); @@ -4870,10 +4869,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, atomic_read(&fs_info->async_delalloc_pages) <= (int)max_reclaim); skip_async: - if (!trans) - flush = BTRFS_RESERVE_FLUSH_ALL; - else - flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { From c79a70b1330b374d6f4d88f266552054a4b58d08 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 17:27:37 +0200 Subject: [PATCH 125/164] btrfs: drop fs_info parameter from btrfs_run_delayed_refs It's provided by the transaction handle. 
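The same shape repeats in the following patches (btrfs_qgroup_account_extents, btrfs_finish_extent_commit, __btrfs_run_delayed_refs): a function that already takes a transaction handle can derive the fs_info from it, so passing it separately only adds noise at every call site. A minimal before/after sketch of the pattern, with a made-up helper name:

/* before: callers must thread fs_info alongside the handle */
int do_work(struct btrfs_trans_handle *trans,
	    struct btrfs_fs_info *fs_info, unsigned long count);

/* after: the callee looks it up from the handle it already has */
int do_work(struct btrfs_trans_handle *trans, unsigned long count)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	/* ... body unchanged, fs_info used exactly as before ... */
	return 0;
}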
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 9 +++++---- fs/btrfs/inode.c | 4 +--- fs/btrfs/transaction.c | 23 +++++++++++------------ 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 50c068fd3b38..b0573cdb4d20 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2623,7 +2623,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long count); + unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 63c679e9b541..7b1c7c0e3d75 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2992,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work) if (trans->transid > async->transid) goto end; - ret = btrfs_run_delayed_refs(trans, fs_info, async->count); + ret = btrfs_run_delayed_refs(trans, async->count); if (ret) async->error = ret; end: @@ -3051,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long count) + unsigned long count) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; @@ -3799,7 +3800,7 @@ again: * go through delayed refs for all the stuff we've just kicked off * and then loop back (just once) */ - ret = btrfs_run_delayed_refs(trans, fs_info, 0); + ret = btrfs_run_delayed_refs(trans, 0); if (!ret && loops == 0) { loops++; spin_lock(&cur_trans->dirty_bgs_lock); @@ -3881,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache_save_setup(cache, trans, path); if (!ret) - ret = btrfs_run_delayed_refs(trans, fs_info, + ret = btrfs_run_delayed_refs(trans, (unsigned long) -1); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e9cceeaf4c4..1a83b71685d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4730,7 +4730,6 @@ delete: if (updates) { trans->delayed_ref_updates = 0; ret = btrfs_run_delayed_refs(trans, - fs_info, updates * 2); if (ret && !err) err = ret; @@ -4770,8 +4769,7 @@ error: unsigned long updates = trans->delayed_ref_updates; if (updates) { trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, fs_info, - updates * 2); + ret = btrfs_run_delayed_refs(trans, updates * 2); if (ret && !err) err = ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f4b1225a3bec..6534fab417ee 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -781,7 +781,6 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_fs_info *fs_info = trans->fs_info; int updates; int err; @@ -793,7 +792,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) updates = 
trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { - err = btrfs_run_delayed_refs(trans, fs_info, updates * 2); + err = btrfs_run_delayed_refs(trans, updates * 2); if (err) /* Error code will also eval true */ return err; } @@ -1161,7 +1160,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; @@ -1180,7 +1179,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) return ret; /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; again: @@ -1197,7 +1196,7 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; } @@ -1206,7 +1205,7 @@ again: ret = btrfs_write_dirty_block_groups(trans, fs_info); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; } @@ -1617,7 +1616,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1671,7 +1670,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1954,7 +1953,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ - ret = btrfs_run_delayed_refs(trans, fs_info, 0); + ret = btrfs_run_delayed_refs(trans, 0); if (ret) { btrfs_end_transaction(trans); return ret; @@ -1975,7 +1974,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans); - ret = btrfs_run_delayed_refs(trans, fs_info, 0); + ret = btrfs_run_delayed_refs(trans, 0); if (ret) { btrfs_end_transaction(trans); return ret; @@ -2124,7 +2123,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; @@ -2175,7 +2174,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * commit_fs_roots() can call btrfs_save_ino_cache(), which generates * new delayed refs. Must handle them or qgroup can be wrong. */ - ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); From 460fb20a4bba040d7a95629ef7a4e9b97bfdbb6e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 16:00:25 +0200 Subject: [PATCH 126/164] btrfs: Drop fs_info parameter from btrfs_qgroup_account_extents It's provided by the transaction handle. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 4 ++-- fs/btrfs/qgroup.h | 3 +-- fs/btrfs/transaction.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 0fa4f07b80b8..569f9ab65ceb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2014,9 +2014,9 @@ out_free: return ret; } -int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d9984e87cddf..ad003483d20c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -220,8 +220,7 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, struct ulist *old_roots, struct ulist *new_roots); -int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6534fab417ee..7c815885ac01 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1369,7 +1369,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans); if (ret) goto out; - ret = btrfs_qgroup_account_extents(trans, fs_info); + ret = btrfs_qgroup_account_extents(trans); if (ret < 0) goto out; @@ -2185,7 +2185,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ - ret = btrfs_qgroup_account_extents(trans, fs_info); + ret = btrfs_qgroup_account_extents(trans); if (ret < 0) { mutex_unlock(&fs_info->tree_log_mutex); mutex_unlock(&fs_info->reloc_mutex); From 5ead2dd02c776e2acf50d5a8cd31a90513f45433 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 16:00:26 +0200 Subject: [PATCH 127/164] btrfs: Drop fs_info parameter from btrfs_finish_extent_commit It's provided by the transaction handle. 
Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/transaction.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b0573cdb4d20..03be51d77357 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2682,8 +2682,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7b1c7c0e3d75..9100a534d539 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6770,9 +6770,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree *unpin; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7c815885ac01..15f6541303bc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2268,7 +2268,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; - btrfs_finish_extent_commit(trans, fs_info); + btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); From 0a1e458a1e49dc6cf13b05f81098e01cf79ceaa1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 15 Mar 2018 16:00:27 +0200 Subject: [PATCH 128/164] btrfs: Drop fs_info parameter from __btrfs_run_delayed_refs It's provided by transaction handle. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9100a534d539..1b54d9a9d938 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2650,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long nr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; @@ -3077,7 +3077,7 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif trans->can_flush_pending_bgs = false; - ret = __btrfs_run_delayed_refs(trans, fs_info, count); + ret = __btrfs_run_delayed_refs(trans, count); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; From 99f92a7c1eba4e3aa82b361bbe92a7ac088fc176 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 14 Mar 2018 16:29:12 +0800 Subject: [PATCH 129/164] btrfs: drop num argument from find_live_mirror() Obtain the stripes info from the map directly and so no need to pass it as an argument. 
Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dcc2b486b2c5..82c21f6eeb3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5274,13 +5274,22 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) } static int find_live_mirror(struct btrfs_fs_info *fs_info, - struct map_lookup *map, int first, int num, + struct map_lookup *map, int first, int optimal, int dev_replace_is_ongoing) { int i; + int num_stripes; int tolerance; struct btrfs_device *srcdev; + ASSERT((map->type & + (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + num_stripes = map->sub_stripes; + else + num_stripes = map->num_stripes; + if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) @@ -5297,7 +5306,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, if (map->stripes[optimal].dev->bdev && (tolerance || map->stripes[optimal].dev != srcdev)) return optimal; - for (i = first; i < first + num; i++) { + for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && (tolerance || map->stripes[i].dev != srcdev)) return i; @@ -5834,7 +5843,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_index = mirror_num - 1; else { stripe_index = find_live_mirror(fs_info, map, 0, - map->num_stripes, current->pid % map->num_stripes, dev_replace_is_ongoing); mirror_num = stripe_index + 1; @@ -5863,7 +5871,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int old_stripe_index = stripe_index; stripe_index = find_live_mirror(fs_info, map, stripe_index, - map->sub_stripes, stripe_index + + stripe_index + current->pid % map->sub_stripes, dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; From 8ba0ae78218ad8fba73ca3be725303be50f936f1 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 14 Mar 2018 16:29:13 +0800 Subject: [PATCH 130/164] btrfs: drop optimal argument from find_live_mirror() Drop optimal argument from the function find_live_mirror() as we can deduce it in the function itself. Also rename optimal to preferred_mirror. 
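What preferred_mirror encodes, in short: RAID1/RAID10 reads pick a starting stripe by hashing the reading task's PID over the stripes of the set, which tends to spread concurrent readers across mirrors; find_live_mirror() then falls back to any live stripe and, while a replace runs in the "avoid" mode, steers away from the source device. Reduced to the arithmetic now done inside the helper (taken from the hunks below):

/* RAID1:  first = 0,            num_stripes = map->num_stripes */
/* RAID10: first = stripe_index, num_stripes = map->sub_stripes  */
preferred_mirror = first + current->pid % num_stripes;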
Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 82c21f6eeb3e..73de042158f1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5275,10 +5275,11 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) static int find_live_mirror(struct btrfs_fs_info *fs_info, struct map_lookup *map, int first, - int optimal, int dev_replace_is_ongoing) + int dev_replace_is_ongoing) { int i; int num_stripes; + int preferred_mirror; int tolerance; struct btrfs_device *srcdev; @@ -5290,6 +5291,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, else num_stripes = map->num_stripes; + preferred_mirror = first + current->pid % num_stripes; + if (dev_replace_is_ongoing && fs_info->dev_replace.cont_reading_from_srcdev_mode == BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) @@ -5303,9 +5306,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, * mirror is available */ for (tolerance = 0; tolerance < 2; tolerance++) { - if (map->stripes[optimal].dev->bdev && - (tolerance || map->stripes[optimal].dev != srcdev)) - return optimal; + if (map->stripes[preferred_mirror].dev->bdev && + (tolerance || map->stripes[preferred_mirror].dev != srcdev)) + return preferred_mirror; for (i = first; i < first + num_stripes; i++) { if (map->stripes[i].dev->bdev && (tolerance || map->stripes[i].dev != srcdev)) @@ -5316,7 +5319,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, /* we couldn't find one that doesn't fail. Just return something * and the io error handling code will clean up eventually */ - return optimal; + return preferred_mirror; } static inline int parity_smaller(u64 a, u64 b) @@ -5843,7 +5846,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_index = mirror_num - 1; else { stripe_index = find_live_mirror(fs_info, map, 0, - current->pid % map->num_stripes, dev_replace_is_ongoing); mirror_num = stripe_index + 1; } @@ -5871,8 +5873,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int old_stripe_index = stripe_index; stripe_index = find_live_mirror(fs_info, map, stripe_index, - stripe_index + - current->pid % map->sub_stripes, dev_replace_is_ongoing); mirror_num = stripe_index - old_stripe_index + 1; } From 8a5a916d9a35e13576d79cc16e24611821b13e34 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 16 Mar 2018 14:36:27 -0400 Subject: [PATCH 131/164] btrfs: fix lockdep splat in btrfs_alloc_subvolume_writers While running btrfs/011, I hit the following lockdep splat. This is the important bit: pcpu_alloc+0x1ac/0x5e0 __percpu_counter_init+0x4e/0xb0 btrfs_init_fs_root+0x99/0x1c0 [btrfs] btrfs_get_fs_root.part.54+0x5b/0x150 [btrfs] resolve_indirect_refs+0x130/0x830 [btrfs] find_parent_nodes+0x69e/0xff0 [btrfs] btrfs_find_all_roots_safe+0xa0/0x110 [btrfs] btrfs_find_all_roots+0x50/0x70 [btrfs] btrfs_qgroup_prepare_account_extents+0x53/0x90 [btrfs] btrfs_commit_transaction+0x3ce/0x9b0 [btrfs] The percpu_counter_init call in btrfs_alloc_subvolume_writers uses GFP_KERNEL, which we can't do during transaction commit. This switches it to GFP_NOFS. 
======================================================== WARNING: possible irq lock inversion dependency detected 4.12.14-kvmsmall #8 Tainted: G W -------------------------------------------------------- kswapd0/50 just changed the state of lock: (&delayed_node->mutex){+.+.-.}, at: [] __btrfs_release_delayed_node+0x3a/0x1f0 [btrfs] but this lock took another, RECLAIM_FS-unsafe lock in the past: (pcpu_alloc_mutex){+.+.+.} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Chain exists of: &delayed_node->mutex --> &found->groups_sem --> pcpu_alloc_mutex Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(pcpu_alloc_mutex); local_irq_disable(); lock(&delayed_node->mutex); lock(&found->groups_sem); lock(&delayed_node->mutex); *** DEADLOCK *** 2 locks held by kswapd0/50: #0: (shrinker_rwsem){++++..}, at: [] shrink_slab+0x7f/0x5b0 #1: (&type->s_umount_key#30){+++++.}, at: [] trylock_super+0x16/0x50 the shortest dependencies between 2nd lock and 1st lock: -> (pcpu_alloc_mutex){+.+.+.} ops: 4904 { HARDIRQ-ON-W at: __mutex_lock+0x4e/0x8c0 pcpu_alloc+0x1ac/0x5e0 alloc_kmem_cache_cpus.isra.70+0x25/0xa0 __do_tune_cpucache+0x2c/0x220 do_tune_cpucache+0x26/0xc0 enable_cpucache+0x6d/0xf0 kmem_cache_init_late+0x42/0x75 start_kernel+0x343/0x4cb x86_64_start_kernel+0x127/0x134 secondary_startup_64+0xa5/0xb0 SOFTIRQ-ON-W at: __mutex_lock+0x4e/0x8c0 pcpu_alloc+0x1ac/0x5e0 alloc_kmem_cache_cpus.isra.70+0x25/0xa0 __do_tune_cpucache+0x2c/0x220 do_tune_cpucache+0x26/0xc0 enable_cpucache+0x6d/0xf0 kmem_cache_init_late+0x42/0x75 start_kernel+0x343/0x4cb x86_64_start_kernel+0x127/0x134 secondary_startup_64+0xa5/0xb0 RECLAIM_FS-ON-W at: __kmalloc+0x47/0x310 pcpu_extend_area_map+0x2b/0xc0 pcpu_alloc+0x3ec/0x5e0 alloc_kmem_cache_cpus.isra.70+0x25/0xa0 __do_tune_cpucache+0x2c/0x220 do_tune_cpucache+0x26/0xc0 enable_cpucache+0x6d/0xf0 __kmem_cache_create+0x1bf/0x390 create_cache+0xba/0x1b0 kmem_cache_create+0x1f8/0x2b0 ksm_init+0x6f/0x19d do_one_initcall+0x50/0x1b0 kernel_init_freeable+0x201/0x289 kernel_init+0xa/0x100 ret_from_fork+0x3a/0x50 INITIAL USE at: __mutex_lock+0x4e/0x8c0 pcpu_alloc+0x1ac/0x5e0 alloc_kmem_cache_cpus.isra.70+0x25/0xa0 setup_cpu_cache+0x2f/0x1f0 __kmem_cache_create+0x1bf/0x390 create_boot_cache+0x8b/0xb1 kmem_cache_init+0xa1/0x19e start_kernel+0x270/0x4cb x86_64_start_kernel+0x127/0x134 secondary_startup_64+0xa5/0xb0 } ... key at: [] pcpu_alloc_mutex+0x70/0xa0 ... 
acquired at: pcpu_alloc+0x1ac/0x5e0 __percpu_counter_init+0x4e/0xb0 btrfs_init_fs_root+0x99/0x1c0 [btrfs] btrfs_get_fs_root.part.54+0x5b/0x150 [btrfs] resolve_indirect_refs+0x130/0x830 [btrfs] find_parent_nodes+0x69e/0xff0 [btrfs] btrfs_find_all_roots_safe+0xa0/0x110 [btrfs] btrfs_find_all_roots+0x50/0x70 [btrfs] btrfs_qgroup_prepare_account_extents+0x53/0x90 [btrfs] btrfs_commit_transaction+0x3ce/0x9b0 [btrfs] transaction_kthread+0x176/0x1b0 [btrfs] kthread+0x102/0x140 ret_from_fork+0x3a/0x50 -> (&fs_info->commit_root_sem){++++..} ops: 1566382 { HARDIRQ-ON-W at: down_write+0x3e/0xa0 cache_block_group+0x287/0x420 [btrfs] find_free_extent+0x106c/0x12d0 [btrfs] btrfs_reserve_extent+0xd8/0x170 [btrfs] cow_file_range.isra.66+0x133/0x470 [btrfs] run_delalloc_range+0x121/0x410 [btrfs] writepage_delalloc.isra.50+0xfe/0x180 [btrfs] __extent_writepage+0x19a/0x360 [btrfs] extent_write_cache_pages.constprop.56+0x249/0x3e0 [btrfs] extent_writepages+0x4d/0x60 [btrfs] do_writepages+0x1a/0x70 __filemap_fdatawrite_range+0xa7/0xe0 btrfs_rename+0x5ee/0xdb0 [btrfs] vfs_rename+0x52a/0x7e0 SyS_rename+0x351/0x3b0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 HARDIRQ-ON-R at: down_read+0x35/0x90 caching_thread+0x57/0x560 [btrfs] normal_work_helper+0x1c0/0x5e0 [btrfs] process_one_work+0x1e0/0x5c0 worker_thread+0x44/0x390 kthread+0x102/0x140 ret_from_fork+0x3a/0x50 SOFTIRQ-ON-W at: down_write+0x3e/0xa0 cache_block_group+0x287/0x420 [btrfs] find_free_extent+0x106c/0x12d0 [btrfs] btrfs_reserve_extent+0xd8/0x170 [btrfs] cow_file_range.isra.66+0x133/0x470 [btrfs] run_delalloc_range+0x121/0x410 [btrfs] writepage_delalloc.isra.50+0xfe/0x180 [btrfs] __extent_writepage+0x19a/0x360 [btrfs] extent_write_cache_pages.constprop.56+0x249/0x3e0 [btrfs] extent_writepages+0x4d/0x60 [btrfs] do_writepages+0x1a/0x70 __filemap_fdatawrite_range+0xa7/0xe0 btrfs_rename+0x5ee/0xdb0 [btrfs] vfs_rename+0x52a/0x7e0 SyS_rename+0x351/0x3b0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 SOFTIRQ-ON-R at: down_read+0x35/0x90 caching_thread+0x57/0x560 [btrfs] normal_work_helper+0x1c0/0x5e0 [btrfs] process_one_work+0x1e0/0x5c0 worker_thread+0x44/0x390 kthread+0x102/0x140 ret_from_fork+0x3a/0x50 INITIAL USE at: down_write+0x3e/0xa0 cache_block_group+0x287/0x420 [btrfs] find_free_extent+0x106c/0x12d0 [btrfs] btrfs_reserve_extent+0xd8/0x170 [btrfs] cow_file_range.isra.66+0x133/0x470 [btrfs] run_delalloc_range+0x121/0x410 [btrfs] writepage_delalloc.isra.50+0xfe/0x180 [btrfs] __extent_writepage+0x19a/0x360 [btrfs] extent_write_cache_pages.constprop.56+0x249/0x3e0 [btrfs] extent_writepages+0x4d/0x60 [btrfs] do_writepages+0x1a/0x70 __filemap_fdatawrite_range+0xa7/0xe0 btrfs_rename+0x5ee/0xdb0 [btrfs] vfs_rename+0x52a/0x7e0 SyS_rename+0x351/0x3b0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 } ... key at: [] __key.61970+0x0/0xfffffffffff9aa88 [btrfs] ... 
acquired at: cache_block_group+0x287/0x420 [btrfs] find_free_extent+0x106c/0x12d0 [btrfs] btrfs_reserve_extent+0xd8/0x170 [btrfs] btrfs_alloc_tree_block+0x12f/0x4c0 [btrfs] btrfs_create_tree+0xbb/0x2a0 [btrfs] btrfs_create_uuid_tree+0x37/0x140 [btrfs] open_ctree+0x23c0/0x2660 [btrfs] btrfs_mount+0xd36/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 btrfs_mount+0x18c/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 do_mount+0x1c1/0xcc0 SyS_mount+0x7e/0xd0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 -> (&found->groups_sem){++++..} ops: 2134587 { HARDIRQ-ON-W at: down_write+0x3e/0xa0 __link_block_group+0x34/0x130 [btrfs] btrfs_read_block_groups+0x33d/0x7b0 [btrfs] open_ctree+0x2054/0x2660 [btrfs] btrfs_mount+0xd36/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 btrfs_mount+0x18c/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 do_mount+0x1c1/0xcc0 SyS_mount+0x7e/0xd0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 HARDIRQ-ON-R at: down_read+0x35/0x90 btrfs_calc_num_tolerated_disk_barrier_failures+0x113/0x1f0 [btrfs] open_ctree+0x207b/0x2660 [btrfs] btrfs_mount+0xd36/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 btrfs_mount+0x18c/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 do_mount+0x1c1/0xcc0 SyS_mount+0x7e/0xd0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 SOFTIRQ-ON-W at: down_write+0x3e/0xa0 __link_block_group+0x34/0x130 [btrfs] btrfs_read_block_groups+0x33d/0x7b0 [btrfs] open_ctree+0x2054/0x2660 [btrfs] btrfs_mount+0xd36/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 btrfs_mount+0x18c/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 do_mount+0x1c1/0xcc0 SyS_mount+0x7e/0xd0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 SOFTIRQ-ON-R at: down_read+0x35/0x90 btrfs_calc_num_tolerated_disk_barrier_failures+0x113/0x1f0 [btrfs] open_ctree+0x207b/0x2660 [btrfs] btrfs_mount+0xd36/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 btrfs_mount+0x18c/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 do_mount+0x1c1/0xcc0 SyS_mount+0x7e/0xd0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 INITIAL USE at: down_write+0x3e/0xa0 __link_block_group+0x34/0x130 [btrfs] btrfs_read_block_groups+0x33d/0x7b0 [btrfs] open_ctree+0x2054/0x2660 [btrfs] btrfs_mount+0xd36/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 btrfs_mount+0x18c/0xf90 [btrfs] mount_fs+0x3a/0x160 vfs_kern_mount+0x66/0x150 do_mount+0x1c1/0xcc0 SyS_mount+0x7e/0xd0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 } ... key at: [] __key.59101+0x0/0xfffffffffff9ab78 [btrfs] ... 
acquired at: find_free_extent+0xcb4/0x12d0 [btrfs] btrfs_reserve_extent+0xd8/0x170 [btrfs] btrfs_alloc_tree_block+0x12f/0x4c0 [btrfs] __btrfs_cow_block+0x110/0x5b0 [btrfs] btrfs_cow_block+0xd7/0x290 [btrfs] btrfs_search_slot+0x1f6/0x960 [btrfs] btrfs_lookup_inode+0x2a/0x90 [btrfs] __btrfs_update_delayed_inode+0x65/0x210 [btrfs] btrfs_commit_inode_delayed_inode+0x121/0x130 [btrfs] btrfs_evict_inode+0x3fe/0x6a0 [btrfs] evict+0xc4/0x190 __dentry_kill+0xbf/0x170 dput+0x2ae/0x2f0 SyS_rename+0x2a6/0x3b0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 -> (&delayed_node->mutex){+.+.-.} ops: 5580204 { HARDIRQ-ON-W at: __mutex_lock+0x4e/0x8c0 btrfs_delayed_update_inode+0x46/0x6e0 [btrfs] btrfs_update_inode+0x83/0x110 [btrfs] btrfs_dirty_inode+0x62/0xe0 [btrfs] touch_atime+0x8c/0xb0 do_generic_file_read+0x818/0xb10 __vfs_read+0xdc/0x150 vfs_read+0x8a/0x130 SyS_read+0x45/0xa0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 SOFTIRQ-ON-W at: __mutex_lock+0x4e/0x8c0 btrfs_delayed_update_inode+0x46/0x6e0 [btrfs] btrfs_update_inode+0x83/0x110 [btrfs] btrfs_dirty_inode+0x62/0xe0 [btrfs] touch_atime+0x8c/0xb0 do_generic_file_read+0x818/0xb10 __vfs_read+0xdc/0x150 vfs_read+0x8a/0x130 SyS_read+0x45/0xa0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 IN-RECLAIM_FS-W at: __mutex_lock+0x4e/0x8c0 __btrfs_release_delayed_node+0x3a/0x1f0 [btrfs] btrfs_evict_inode+0x22c/0x6a0 [btrfs] evict+0xc4/0x190 dispose_list+0x35/0x50 prune_icache_sb+0x42/0x50 super_cache_scan+0x139/0x190 shrink_slab+0x262/0x5b0 shrink_node+0x2eb/0x2f0 kswapd+0x2eb/0x890 kthread+0x102/0x140 ret_from_fork+0x3a/0x50 INITIAL USE at: __mutex_lock+0x4e/0x8c0 btrfs_delayed_update_inode+0x46/0x6e0 [btrfs] btrfs_update_inode+0x83/0x110 [btrfs] btrfs_dirty_inode+0x62/0xe0 [btrfs] touch_atime+0x8c/0xb0 do_generic_file_read+0x818/0xb10 __vfs_read+0xdc/0x150 vfs_read+0x8a/0x130 SyS_read+0x45/0xa0 do_syscall_64+0x79/0x1e0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 } ... key at: [] __key.56935+0x0/0xfffffffffff96b78 [btrfs] ... acquired at: __lock_acquire+0x264/0x11c0 lock_acquire+0xbd/0x1e0 __mutex_lock+0x4e/0x8c0 __btrfs_release_delayed_node+0x3a/0x1f0 [btrfs] btrfs_evict_inode+0x22c/0x6a0 [btrfs] evict+0xc4/0x190 dispose_list+0x35/0x50 prune_icache_sb+0x42/0x50 super_cache_scan+0x139/0x190 shrink_slab+0x262/0x5b0 shrink_node+0x2eb/0x2f0 kswapd+0x2eb/0x890 kthread+0x102/0x140 ret_from_fork+0x3a/0x50 stack backtrace: CPU: 1 PID: 50 Comm: kswapd0 Tainted: G W 4.12.14-kvmsmall #8 SLE15 (unreleased) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0x78/0xb7 print_irq_inversion_bug.part.38+0x19f/0x1aa check_usage_forwards+0x102/0x120 ? ret_from_fork+0x3a/0x50 ? check_usage_backwards+0x110/0x110 mark_lock+0x16c/0x270 __lock_acquire+0x264/0x11c0 ? pagevec_lookup_entries+0x1a/0x30 ? truncate_inode_pages_range+0x2b3/0x7f0 lock_acquire+0xbd/0x1e0 ? __btrfs_release_delayed_node+0x3a/0x1f0 [btrfs] __mutex_lock+0x4e/0x8c0 ? __btrfs_release_delayed_node+0x3a/0x1f0 [btrfs] ? __btrfs_release_delayed_node+0x3a/0x1f0 [btrfs] ? btrfs_evict_inode+0x1f6/0x6a0 [btrfs] __btrfs_release_delayed_node+0x3a/0x1f0 [btrfs] btrfs_evict_inode+0x22c/0x6a0 [btrfs] evict+0xc4/0x190 dispose_list+0x35/0x50 prune_icache_sb+0x42/0x50 super_cache_scan+0x139/0x190 shrink_slab+0x262/0x5b0 shrink_node+0x2eb/0x2f0 kswapd+0x2eb/0x890 kthread+0x102/0x140 ? mem_cgroup_shrink_node+0x2c0/0x2c0 ? 
kthread_create_on_node+0x40/0x40 ret_from_fork+0x3a/0x50 Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 04834cb65272..1657d6aa4fa6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1095,7 +1095,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); + ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); if (ret < 0) { kfree(writers); return ERR_PTR(ret); From 580c6efaf91f89fae1efda53892df41ae1e41559 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 22 Mar 2018 09:20:11 +0800 Subject: [PATCH 132/164] Btrfs: replace: cache rbio when rebuild data on missing device Rebuild on missing device is as same as recover, after it's done, rbio has data which is consistent with on-disk data, so it can be cached to avoid further reads. Signed-off-by: Liu Bo Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 002bddc7fa09..c3a2bc8af675 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1987,7 +1987,13 @@ cleanup: kfree(pointers); cleanup_io: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + /* + * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a + * valid rbio which is consistent with ondisk content, thus such a + * valid rbio can be cached to avoid further disk reads. + */ + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { /* * - In case of two failures, where rbio->failb != -1: * @@ -2008,8 +2014,6 @@ cleanup_io: else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - rbio_orig_end_io(rbio, err); - } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { rbio_orig_end_io(rbio, err); } else if (err == BLK_STS_OK) { rbio->faila = -1; From dc2d3005d27da41247d6c42077e335a777afc79c Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 20 Mar 2018 15:25:25 -0400 Subject: [PATCH 133/164] btrfs: remove dead create_space_info calls Since commit 2be12ef79 (btrfs: Separate space_info create/update), we've separated out the creation and updating of the space info structures. That commit was a straightforward refactoring of the two parts of update_space_info, but we can go a step further. Since commits c59021f84 (Btrfs: fix OOPS of empty filesystem after balance) and b742bb82f (Btrfs: Link block groups of different raid types), we know that the space_info structures will be created at mount and there will only ever be, at most, three of them. This patch cleans out the create_space_info calls after __find_space_info returns NULL since __find_space_info *can't* return NULL. The initial cause for reviewing this was the kobject_add calls from create_space_info occuring in sites where fs-reclaim wasn't allowed. Now we are certain they occur only early in the mount process and are safe. 
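The cleanup reduces to replacing a recovery path that can no longer trigger with a statement of the invariant. A condensed sketch of the two hunks below; note that btrfs's ASSERT() compiles away unless CONFIG_BTRFS_ASSERT is enabled, so it documents the assumption rather than handling it:

/* before: dead fallback for a lookup that cannot fail after mount */
space_info = __find_space_info(fs_info, flags);
if (!space_info) {
	ret = create_space_info(fs_info, flags, &space_info);
	if (ret)
		return ret;
}

/* after: state the invariant established at mount time */
space_info = __find_space_info(fs_info, flags);
ASSERT(space_info);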
Signed-off-by: Jeff Mahoney Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 +++--- fs/btrfs/extent-tree.c | 16 ++-------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03be51d77357..ffa72ca5755d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -954,9 +954,9 @@ struct btrfs_fs_info { struct btrfs_fs_devices *fs_devices; /* - * the space_info list is almost entirely read only. It only changes - * when we add a new raid type to the FS, and that happens - * very rarely. RCU is used to protect it. + * The space_info list is effectively read only after initial + * setup. It is populated at mount time and cleaned up after + * all block groups are removed. RCU is used to protect it. */ struct list_head space_info; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1b54d9a9d938..e10f74dac493 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4606,11 +4606,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, return -ENOSPC; space_info = __find_space_info(fs_info, flags); - if (!space_info) { - ret = create_space_info(fs_info, flags, &space_info); - if (ret) - return ret; - } + ASSERT(space_info); again: spin_lock(&space_info->lock); @@ -10265,15 +10261,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * with its ->space_info set. */ cache->space_info = __find_space_info(fs_info, cache->flags); - if (!cache->space_info) { - ret = create_space_info(fs_info, cache->flags, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; - } - } + ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(fs_info, cache); if (ret) { From 75cb379d2635215ad2c67750693f7dc45ad19a5f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 20 Mar 2018 15:25:26 -0400 Subject: [PATCH 134/164] btrfs: defer adding raid type kobject until after chunk relocation Any time the first block group of a new type is created, we add a new kobject to sysfs to hold the attributes for that type. Kobject-internal allocations always use GFP_KERNEL, making them prone to fs-reclaim races. While it appears as if this can occur any time a block group is created, the only times the first block group of a new type can be created in memory is at mount and when we create the first new block group during raid conversion. This patch adds a new list to track pending kobject additions and then handles them after we do chunk relocation. Between relocating the target chunk (or forcing allocation of a new chunk in the case of data) and removing the old chunk, we're in a safe place for fs-reclaim to occur. We're holding the volume mutex, which is already held across page faults, and the delete_unused_bgs_mutex, which will only stall the cleaner thread. 
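Condensed, the deferral works roughly as follows (a sketch of the hunks below, with error handling trimmed): the reclaim-unsafe path only queues a pre-initialized raid_kobject, and the kobject_add() calls are batched into btrfs_add_raid_kobjects() once fs-reclaim is allowed again:

        /* In link_block_group(), where kobject_add() would be unsafe: */
        rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
        rkobj->flags = cache->flags;
        kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
        spin_lock(&fs_info->pending_raid_kobjs_lock);
        list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
        spin_unlock(&fs_info->pending_raid_kobjs_lock);

        /* Later, from btrfs_read_block_groups() and btrfs_relocate_chunk(),
         * in a context where fs-reclaim is safe: */
        btrfs_add_raid_kobjects(fs_info);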
Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ++++- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 60 +++++++++++++++++++++++++++++------------- fs/btrfs/sysfs.c | 2 +- fs/btrfs/volumes.c | 12 +++++++++ 5 files changed, 62 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ffa72ca5755d..8d3aa56b928b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -385,8 +385,9 @@ struct btrfs_dev_replace { /* For raid type sysfs entries */ struct raid_kobject { - int raid_type; + u64 flags; struct kobject kobj; + struct list_head list; }; struct btrfs_space_info { @@ -940,6 +941,8 @@ struct btrfs_fs_info { u32 thread_pool_size; struct kobject *space_info_kobj; + struct list_head pending_raid_kobjs; + spinlock_t pending_raid_kobjs_lock; /* uncontended */ u64 total_pinned; @@ -2700,6 +2703,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); +void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1657d6aa4fa6..38b387ae78f8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2431,6 +2431,8 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); + INIT_LIST_HEAD(&fs_info->pending_raid_kobjs); + spin_lock_init(&fs_info->pending_raid_kobjs_lock); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e10f74dac493..0b1f01dd02de 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9918,9 +9918,39 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } +/* link_block_group will queue up kobjects to add when we're reclaim-safe */ +void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + struct raid_kobject *rkobj; + LIST_HEAD(list); + int index; + int ret = 0; + + spin_lock(&fs_info->pending_raid_kobjs_lock); + list_splice_init(&fs_info->pending_raid_kobjs, &list); + spin_unlock(&fs_info->pending_raid_kobjs_lock); + + list_for_each_entry(rkobj, &list, list) { + space_info = __find_space_info(fs_info, rkobj->flags); + index = btrfs_bg_flags_to_raid_index(rkobj->flags); + + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + break; + } + } + if (ret) + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); +} + static void link_block_group(struct btrfs_block_group_cache *cache) { struct btrfs_space_info *space_info = cache->space_info; + struct btrfs_fs_info *fs_info = cache->fs_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); bool first = false; @@ -9931,27 +9961,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache) up_write(&space_info->groups_sem); if (first) { - struct raid_kobject *rkobj; - int ret; - - rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); - if (!rkobj) - goto out_err; - rkobj->raid_type = index; - kobject_init(&rkobj->kobj, &btrfs_raid_ktype); - ret = kobject_add(&rkobj->kobj, 
&space_info->kobj, - "%s", get_raid_name(index)); - if (ret) { - kobject_put(&rkobj->kobj); - goto out_err; + struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) { + btrfs_warn(cache->fs_info, + "couldn't alloc memory for raid level kobject"); + return; } + rkobj->flags = cache->flags; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + spin_lock(&fs_info->pending_raid_kobjs_lock); + list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); + spin_unlock(&fs_info->pending_raid_kobjs_lock); space_info->block_group_kobjs[index] = &rkobj->kobj; } - - return; -out_err: - btrfs_warn(cache->fs_info, - "failed to add kobject for block cache, ignoring"); } static struct btrfs_block_group_cache * @@ -10167,6 +10190,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) inc_block_group_ro(cache, 1); } + btrfs_add_raid_kobjects(info); init_global_block_rsv(info); ret = 0; error: diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6af7b58e1a90..ca067471cd46 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -272,7 +272,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, { struct btrfs_space_info *sinfo = to_space_info(kobj->parent); struct btrfs_block_group_cache *block_group; - int index = to_raid_kobj(kobj)->raid_type; + int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags); u64 val = 0; down_read(&sinfo->groups_sem); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73de042158f1..4fc6acf65220 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3003,6 +3003,16 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; + /* + * We add the kobjects here (and after forcing data chunk creation) + * since relocation is the only place we'll create chunks of a new + * type at runtime. The only place where we'll remove the last + * chunk of a type is the call immediately below this one. Even + * so, we're protected against races with the cleaner thread since + * we're covered by the delete_unused_bgs_mutex. + */ + btrfs_add_raid_kobjects(fs_info); + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3130,6 +3140,8 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, if (ret < 0) return ret; + btrfs_add_raid_kobjects(fs_info); + return 1; } } From 4408ea7c5fd92cbdff3b5890601b9be6610bbb33 Mon Sep 17 00:00:00 2001 From: "Misono, Tomohiro" Date: Tue, 20 Mar 2018 15:47:06 +0900 Subject: [PATCH 135/164] btrfs: ctree.h: Fix wrong comment position about csum size Signed-off-by: Tomohiro Misono Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8d3aa56b928b..f9f512be9d41 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -89,9 +89,9 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U +/* four bytes for CRC32 */ static const int btrfs_csum_sizes[] = { 4 }; -/* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 /* ioprio of readahead is set to idle */ From 0a0d4415e3389b6a2e89896808dea27d2402d154 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 25 Jan 2018 15:56:17 -0800 Subject: [PATCH 136/164] Btrfs: delete dead code in btrfs_orphan_add() btrfs_orphan_add() has had this case commented out since it was first introduced in commit d68fc57b7e32 ("Btrfs: Metadata reservation for orphan inodes"). 
Most of the orphan cleanup code has been rewritten since then, so it's safe to say that this code isn't needed. Signed-off-by: Omar Sandoval Reviewed-by: Nikolay Borisov [ switch to bool ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1a83b71685d1..a9a47387e53f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3366,7 +3366,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = NULL; int reserve = 0; - int insert = 0; + bool insert = false; int ret; if (!root->orphan_block_rsv) { @@ -3377,20 +3377,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, } if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &inode->runtime_flags)) { -#if 0 - /* - * For proper ENOSPC handling, we should do orphan - * cleanup when mounting. But this introduces backward - * compatibility issue. - */ - if (!xchg(&root->orphan_item_inserted, 1)) - insert = 2; - else - insert = 1; -#endif - insert = 1; - } + &inode->runtime_flags)) + insert = true; if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags)) @@ -3430,7 +3418,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, } /* insert an orphan item to track this unlinked/truncated file */ - if (insert >= 1) { + if (insert) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret) { if (reserve) { @@ -3454,15 +3442,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, ret = 0; } - /* insert an orphan item to track subvolume contains orphan files */ - if (insert >= 2) { - ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, - root->root_key.objectid); - if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, ret); - return ret; - } - } return 0; } From d4e5c92055d8933e9a2030fcbe6d0dbbec538c58 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:23 +0800 Subject: [PATCH 137/164] btrfs: qgroup: Skeleton to support separate qgroup reservation type Instead of single qgroup->reserved, use a new structure btrfs_qgroup_rsv to store different types of reservation. This patch only updates the header needed to compile. 
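For orientation, the new structure simply replaces the single qgroup->reserved counter with one counter per reservation type; a minimal, purely illustrative use of the header added below would be:

        struct btrfs_qgroup_rsv rsv = {};

        /* example amounts only, chosen for illustration */
        rsv.values[BTRFS_QGROUP_RSV_DATA] += SZ_1M;   /* data reservation */
        rsv.values[BTRFS_QGROUP_RSV_META] += SZ_16K;  /* metadata reservation */
        /* BTRFS_QGROUP_RSV_LAST only sizes the array, it is never an index. */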
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 16 ++++++++++------ fs/btrfs/qgroup.h | 27 +++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 569f9ab65ceb..d6ab07f40f82 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2434,7 +2434,8 @@ out: } void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes) + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2925,7 +2926,8 @@ static int qgroup_free_reserved_data(struct inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed, + BTRFS_QGROUP_RSV_DATA); ret = freed; out: extent_changeset_release(&changeset); @@ -2957,7 +2959,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, if (free) btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: extent_changeset_release(&changeset); @@ -3034,7 +3036,8 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) if (reserved == 0) return; trace_qgroup_meta_reserve(root, -(s64)reserved); - btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); + btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved, + BTRFS_QGROUP_RSV_META); } void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) @@ -3049,7 +3052,8 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); atomic64_sub(num_bytes, &root->qgroup_meta_rsv); trace_qgroup_meta_reserve(root, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); + btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, + BTRFS_QGROUP_RSV_META); } /* @@ -3077,7 +3081,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) } btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } extent_changeset_release(&changeset); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ad003483d20c..b5ee229b3ef1 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -61,6 +61,26 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +enum btrfs_qgroup_rsv_type { + BTRFS_QGROUP_RSV_DATA = 0, + BTRFS_QGROUP_RSV_META, + BTRFS_QGROUP_RSV_LAST, +}; + +/* + * Represents how many bytes we have reserved for this qgroup. + * + * Each type should have different reservation behavior. + * E.g, data follows its io_tree flag modification, while + * *currently* meta is just reserve-and-clear during transcation. + * + * TODO: Add new type for reservation which can survive transaction commit. + * Currect metadata reservation behavior is not suitable for such case. + */ +struct btrfs_qgroup_rsv { + u64 values[BTRFS_QGROUP_RSV_LAST]; +}; + /* * one struct for each qgroup, organized in fs_info->qgroup_tree. 
*/ @@ -88,6 +108,7 @@ struct btrfs_qgroup { * reservation tracking */ u64 reserved; + struct btrfs_qgroup_rsv rsv; /* * lists @@ -227,12 +248,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes); + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type); static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, + BTRFS_QGROUP_RSV_DATA); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS From f59c0347d4be22dad5812b5b14bf94ac0efd371a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:24 +0800 Subject: [PATCH 138/164] btrfs: qgroup: Introduce helpers to update and access new qgroup rsv Introduce helpers to: 1) Get total reserved space For limit calculation 2) Add/release reserved space for given type With underflow detection and warning 3) Add/release reserved space according to child qgroup Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d6ab07f40f82..252af87340cc 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,6 +47,74 @@ * - check all ioctl parameters */ +/* + * Helpers to access qgroup reservation + * + * Callers should ensure the lock context and type are valid + */ + +static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) +{ + u64 ret = 0; + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + ret += qgroup->rsv.values[i]; + + return ret; +} + +#ifdef CONFIG_BTRFS_DEBUG +static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) +{ + if (type == BTRFS_QGROUP_RSV_DATA) + return "data"; + if (type == BTRFS_QGROUP_RSV_META) + return "meta"; + return NULL; +} +#endif + +static void qgroup_rsv_add(struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + qgroup->rsv.values[type] += num_bytes; +} + +static void qgroup_rsv_release(struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + if (qgroup->rsv.values[type] >= num_bytes) { + qgroup->rsv.values[type] -= num_bytes; + return; + } +#ifdef CONFIG_BTRFS_DEBUG + WARN_RATELIMIT(1, + "qgroup %llu %s reserved space underflow, have %llu to free %llu", + qgroup->qgroupid, qgroup_rsv_type_str(type), + qgroup->rsv.values[type], num_bytes); +#endif + qgroup->rsv.values[type] = 0; +} + +static void qgroup_rsv_add_by_qgroup(struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_add(dest, src->rsv.values[i], i); +} + +static void qgroup_rsv_release_by_qgroup(struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_release(dest, src->rsv.values[i], i); +} + static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { From dba213242fbcfc5495004ab76ca27c35ce1bf304 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:25 +0800 Subject: [PATCH 139/164] btrfs: qgroup: Make qgroup_reserve and its callers to use separate reservation type Since most callers of 
qgroup_reserve() are already defined by type, converting qgroup_reserve() is quite an easy work. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 252af87340cc..58a8b6930960 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2391,17 +2391,18 @@ out: static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && - qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer) + qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && - qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl) + qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; return true; } -static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, + enum btrfs_qgroup_rsv_type type) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2453,7 +2454,7 @@ retry: * Commit the tree and retry, since we may have * deletions which would free up space. */ - if (!retried && qg->reserved > 0) { + if (!retried && qgroup_rsv_total(qg) > 0) { struct btrfs_trans_handle *trans; spin_unlock(&fs_info->qgroup_lock); @@ -2493,7 +2494,7 @@ retry: qg = unode_aux_to_qgroup(unode); trace_qgroup_update_reserve(fs_info, qg, num_bytes); - qg->reserved += num_bytes; + qgroup_rsv_add(qg, num_bytes, type); } out: @@ -2540,10 +2541,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); - if (qg->reserved < num_bytes) - report_reserved_underflow(fs_info, qg, num_bytes); - else - qg->reserved -= num_bytes; + qgroup_rsv_release(qg, num_bytes, type); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, @@ -2931,7 +2929,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, to_reserve, true); + ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; @@ -3084,7 +3082,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, (s64)num_bytes); - ret = qgroup_reserve(root, num_bytes, enforce); + ret = qgroup_reserve(root, num_bytes, enforce, BTRFS_QGROUP_RSV_META); if (ret < 0) return ret; atomic64_add(num_bytes, &root->qgroup_meta_rsv); From 429d6275d50199dd4c3a5876754003ae06c7f927 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:26 +0800 Subject: [PATCH 140/164] btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification When modifying qgroup relationship, for qgroup which only owns exclusive extents, we will go through quick update path. In this path, we will add/subtract exclusive and reference number for parent qgroup, since the source (child) qgroup only has exclusive extents, destination (parent) qgroup will also own or lose those extents exclusively. The same should be the same for reservation, since later reservation adding/releasing will also affect parent qgroup, without the reservation carried from child, parent will underflow reservation or have dead reservation which will never be freed. 
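As a concrete, hypothetical example of the intended behavior: if the child qgroup exclusively owns 1 GiB of extents and still holds 128 MiB of data reservation, assigning it to a parent should adjust the parent roughly as sketched below (removal does the inverse), otherwise a later release of those 128 MiB against the parent underflows its counter:

        parent->excl += SZ_1G;   /* exclusive extents are carried up ... */
        parent->rfer += SZ_1G;
        /* ... and so is the outstanding reservation */
        parent->rsv.values[BTRFS_QGROUP_RSV_DATA] += 128 * SZ_1M;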
However original code doesn't do the same thing for reservation. It handles qgroup reservation quite differently: It removes qgroup reservation, as it's allocating space from the reserved qgroup for relationship adding. But does nothing for qgroup reservation if we're removing a qgroup relationship. According to the original code, it looks just like because we're adding qgroup->rfer, the code assumes we're writing new data, so it's follows the normal write routine, by reducing qgroup->reserved and adding qgroup->rfer/excl. This old behavior is wrong, and should be fixed to follow the same excl/rfer behavior. Just fix it by using the correct behavior described above. Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.") Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 58a8b6930960..87672d03c8ac 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1076,21 +1076,30 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info, #endif qgroup->reserved = 0; } + /* - * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their reference and - * exclusive counts adjusted. + * The easy accounting, we're updating qgroup relationship whose child qgroup + * only has exclusive extents. + * + * In this case, all exclsuive extents will also be exlusive for parent, so + * excl/rfer just get added/removed. + * + * So is qgroup reservation space, which should also be added/removed to + * parent. + * Or when child tries to release reservation space, parent will underflow its + * reservation (for relationship adding case). * * Caller should hold fs_info->qgroup_lock. 
*/ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, struct ulist *tmp, u64 ref_root, - u64 num_bytes, int sign) + struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *glist; struct ulist_node *unode; struct ulist_iterator uiter; + u64 num_bytes = src->excl; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1103,13 +1112,11 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; - if (sign > 0) { - trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); - if (qgroup->reserved < num_bytes) - report_reserved_underflow(fs_info, qgroup, num_bytes); - else - qgroup->reserved -= num_bytes; - } + + if (sign > 0) + qgroup_rsv_add_by_qgroup(qgroup, src); + else + qgroup_rsv_release_by_qgroup(qgroup, src); qgroup_dirty(fs_info, qgroup); @@ -1129,15 +1136,10 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->rfer_cmpr += sign * num_bytes; WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; - if (sign > 0) { - trace_qgroup_update_reserve(fs_info, qgroup, - -(s64)num_bytes); - if (qgroup->reserved < num_bytes) - report_reserved_underflow(fs_info, qgroup, - num_bytes); - else - qgroup->reserved -= num_bytes; - } + if (sign > 0) + qgroup_rsv_add_by_qgroup(qgroup, src); + else + qgroup_rsv_release_by_qgroup(qgroup, src); qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); @@ -1180,7 +1182,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, if (qgroup->excl == qgroup->rfer) { ret = 0; err = __qgroup_excl_accounting(fs_info, tmp, dst, - qgroup->excl, sign); + qgroup, sign); if (err < 0) { ret = err; goto out; From 64ee4e751a1c43b155afe2c1c07212893836f36d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:27 +0800 Subject: [PATCH 141/164] btrfs: qgroup: Update trace events to use new separate rsv types Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 36 +++++++++++++++++++++--------------- include/trace/events/btrfs.h | 17 ++++++++++++----- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 87672d03c8ac..8ec103deb361 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -75,15 +75,19 @@ static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) } #endif -static void qgroup_rsv_add(struct btrfs_qgroup *qgroup, u64 num_bytes, +static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { + trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); qgroup->rsv.values[type] += num_bytes; } -static void qgroup_rsv_release(struct btrfs_qgroup *qgroup, u64 num_bytes, +static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return; @@ -97,22 +101,24 @@ static void qgroup_rsv_release(struct btrfs_qgroup *qgroup, u64 num_bytes, qgroup->rsv.values[type] = 0; } -static void qgroup_rsv_add_by_qgroup(struct btrfs_qgroup *dest, - struct btrfs_qgroup *src) +static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) { int i; for 
(i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) - qgroup_rsv_add(dest, src->rsv.values[i], i); + qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); } -static void qgroup_rsv_release_by_qgroup(struct btrfs_qgroup *dest, +static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, struct btrfs_qgroup *src) { int i; for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) - qgroup_rsv_release(dest, src->rsv.values[i], i); + qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); } static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, @@ -1114,9 +1120,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) - qgroup_rsv_add_by_qgroup(qgroup, src); + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else - qgroup_rsv_release_by_qgroup(qgroup, src); + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup_dirty(fs_info, qgroup); @@ -1137,9 +1143,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; if (sign > 0) - qgroup_rsv_add_by_qgroup(qgroup, src); + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else - qgroup_rsv_release_by_qgroup(qgroup, src); + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); @@ -2495,8 +2501,8 @@ retry: qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, num_bytes); - qgroup_rsv_add(qg, num_bytes, type); + trace_qgroup_update_reserve(fs_info, qg, num_bytes, type); + qgroup_rsv_add(fs_info, qg, num_bytes, type); } out: @@ -2542,8 +2548,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); - qgroup_rsv_release(qg, num_bytes, type); + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type); + qgroup_rsv_release(fs_info, qg, num_bytes, type); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 486771e3f4cb..54b9af822a3a 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -71,6 +71,11 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); { BTRFS_FILE_EXTENT_REG, "REG" }, \ { BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC"}) +#define show_qgroup_rsv_type(type) \ + __print_symbolic(type, \ + { BTRFS_QGROUP_RSV_DATA, "DATA" }, \ + { BTRFS_QGROUP_RSV_META, "META" }) + #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ @@ -1633,24 +1638,26 @@ TRACE_EVENT(qgroup_update_counters, TRACE_EVENT(qgroup_update_reserve, TP_PROTO(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, - s64 diff), + s64 diff, int type), - TP_ARGS(fs_info, qgroup, diff), + TP_ARGS(fs_info, qgroup, diff, type), TP_STRUCT__entry_btrfs( __field( u64, qgid ) __field( u64, cur_reserved ) __field( s64, diff ) + __field( int, type ) ), TP_fast_assign_btrfs(fs_info, __entry->qgid = qgroup->qgroupid; - __entry->cur_reserved = qgroup->reserved; + __entry->cur_reserved = qgroup->rsv.values[type]; __entry->diff = diff; ), - TP_printk_btrfs("qgid=%llu cur_reserved=%llu diff=%lld", - __entry->qgid, __entry->cur_reserved, __entry->diff) + TP_printk_btrfs("qgid=%llu type=%s cur_reserved=%llu diff=%lld", + __entry->qgid, show_qgroup_rsv_type(__entry->type), + __entry->cur_reserved, __entry->diff) ); 
TRACE_EVENT(qgroup_meta_reserve, From 5c40507ffb1bbbc8eeeaa6d8da181f431cb83d97 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:28 +0800 Subject: [PATCH 142/164] btrfs: qgroup: Cleanup the remaining old reservation counters So qgroup is switched to new separate types reservation system. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 13 ------------- fs/btrfs/qgroup.h | 1 - 2 files changed, 14 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8ec103deb361..c0deebfecd93 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1070,19 +1070,6 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } -static void report_reserved_underflow(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup, - u64 num_bytes) -{ -#ifdef CONFIG_BTRFS_DEBUG - WARN_ON(qgroup->reserved < num_bytes); - btrfs_debug(fs_info, - "qgroup %llu reserved space underflow, have: %llu, to free: %llu", - qgroup->qgroupid, qgroup->reserved, num_bytes); -#endif - qgroup->reserved = 0; -} - /* * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index b5ee229b3ef1..279e71a21695 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -107,7 +107,6 @@ struct btrfs_qgroup { /* * reservation tracking */ - u64 reserved; struct btrfs_qgroup_rsv rsv; /* From 733e03a0b26a463d75aa86083c9fab856571e7fc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:29 +0800 Subject: [PATCH 143/164] btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans Btrfs uses 2 different methods to reseve metadata qgroup space. 1) Reserve at btrfs_start_transaction() time This is quite straightforward, caller will use the trans handler allocated to modify b-trees. In this case, reserved metadata should be kept until qgroup numbers are updated. 2) Reserve by using block_rsv first, and later btrfs_join_transaction() This is more complicated, caller will reserve space using block_rsv first, and then later call btrfs_join_transaction() to get a trans handle. In this case, before we modify trees, the reserved space can be modified on demand, and after btrfs_join_transaction(), such reserved space should also be kept until qgroup numbers are updated. Since these two types behave differently, split the original "META" reservation type into 2 sub-types: META_PERTRANS: For above case 1) META_PREALLOC: For reservations that happened before btrfs_join_transaction() of case 2) NOTE: This patch will only convert existing qgroup meta reservation callers according to its situation, not ensuring all callers are at correct timing. Such fix will be added in later patches. 
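In caller terms the split looks roughly like this; num_items and num_bytes stand in for whatever the caller computed, and later patches adjust the real call sites:

        /* Case 1: reserved as part of btrfs_start_transaction(), kept
         * until the transaction commits: */
        ret = btrfs_qgroup_reserve_meta_pertrans(root,
                        num_items * fs_info->nodesize, true);

        /* Case 2: reserved up front, before a later
         * btrfs_join_transaction(); may still be trimmed or freed: */
        ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);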
Signed-off-by: Qu Wenruo [ update comments ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++--- fs/btrfs/qgroup.c | 22 ++++++------ fs/btrfs/qgroup.h | 69 +++++++++++++++++++++++++++++++++--- fs/btrfs/transaction.c | 8 ++--- include/trace/events/btrfs.h | 5 +-- 5 files changed, 87 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0b1f01dd02de..020c1a1a6526 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5977,7 +5977,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { /* One for parent inode, two for dir entries */ num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta(root, num_bytes, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret) return ret; } else { @@ -5996,7 +5996,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); if (ret && *qgroup_reserved) - btrfs_qgroup_free_meta(root, *qgroup_reserved); + btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved); return ret; } @@ -6072,7 +6072,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) spin_unlock(&inode->lock); if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - ret = btrfs_qgroup_reserve_meta(root, + ret = btrfs_qgroup_reserve_meta_prealloc(root, nr_extents * fs_info->nodesize, true); if (ret) goto out_fail; @@ -6080,7 +6080,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) ret = btrfs_inode_rsv_refill(inode, flush); if (unlikely(ret)) { - btrfs_qgroup_free_meta(root, + btrfs_qgroup_free_meta_prealloc(root, nr_extents * fs_info->nodesize); goto out_fail; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c0deebfecd93..8831eaa14204 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -69,8 +69,10 @@ static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) { if (type == BTRFS_QGROUP_RSV_DATA) return "data"; - if (type == BTRFS_QGROUP_RSV_META) - return "meta"; + if (type == BTRFS_QGROUP_RSV_META_PERTRANS) + return "meta_pertrans"; + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) + return "meta_prealloc"; return NULL; } #endif @@ -3065,8 +3067,8 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - bool enforce) +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3077,14 +3079,14 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, (s64)num_bytes); - ret = qgroup_reserve(root, num_bytes, enforce, BTRFS_QGROUP_RSV_META); + ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; atomic64_add(num_bytes, &root->qgroup_meta_rsv); return ret; } -void btrfs_qgroup_free_meta_all(struct btrfs_root *root) +void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; u64 reserved; @@ -3098,10 +3100,11 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) return; trace_qgroup_meta_reserve(root, -(s64)reserved); btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved, - BTRFS_QGROUP_RSV_META); + 
BTRFS_QGROUP_RSV_META_PERTRANS); } -void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) +void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3113,8 +3116,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); atomic64_sub(num_bytes, &root->qgroup_meta_rsv); trace_qgroup_meta_reserve(root, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, - BTRFS_QGROUP_RSV_META); + btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); } /* diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 279e71a21695..987a5a49deb8 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -61,9 +61,31 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * Qgroup reservation types: + * + * DATA: + * space reserved for data + * + * META_PERTRANS: + * Space reserved for metadata (per-transaction) + * Due to the fact that qgroup data is only updated at transaction commit + * time, reserved space for metadata must be kept until transaction + * commits. + * Any metadata reserved that are used in btrfs_start_transaction() should + * be of this type. + * + * META_PREALLOC: + * There are cases where metadata space is reserved before starting + * transaction, and then btrfs_join_transaction() to get a trans handle. + * Any metadata reserved for such usage should be of this type. + * And after join_transaction() part (or all) of such reservation should + * be converted into META_PERTRANS. + */ enum btrfs_qgroup_rsv_type { BTRFS_QGROUP_RSV_DATA = 0, - BTRFS_QGROUP_RSV_META, + BTRFS_QGROUP_RSV_META_PERTRANS, + BTRFS_QGROUP_RSV_META_PREALLOC, BTRFS_QGROUP_RSV_LAST, }; @@ -269,9 +291,46 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - bool enforce); -void btrfs_qgroup_free_meta_all(struct btrfs_root *root); -void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce); +/* Reserve metadata space for pertrans and prealloc type */ +static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, + int num_bytes, bool enforce) +{ + return __btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS, enforce); +} +static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, + int num_bytes, bool enforce) +{ + return __btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC, enforce); +} + +void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type); + +/* Free per-transaction meta reservation for error handling */ +static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root, + int num_bytes) +{ + __btrfs_qgroup_free_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); +} + +/* Pre-allocated meta reservation can be freed at need */ +static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, + int num_bytes) +{ + __btrfs_qgroup_free_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); +} + +/* + * Per-transaction meta reservation should be all freed at transaction commit + * time + */ +void 
btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); + void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 15f6541303bc..5c4cf0f9146b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -498,8 +498,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, */ if (num_items && root != fs_info->chunk_root) { qgroup_reserved = num_items * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved, - enforce_qgroups); + ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, + enforce_qgroups); if (ret) return ERR_PTR(ret); @@ -596,7 +596,7 @@ alloc_fail: btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, num_bytes); reserve_fail: - btrfs_qgroup_free_meta(root, qgroup_reserved); + btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); return ERR_PTR(ret); } @@ -1284,7 +1284,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; - btrfs_qgroup_free_meta_all(root); + btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 54b9af822a3a..eee778ba1414 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -73,8 +73,9 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); #define show_qgroup_rsv_type(type) \ __print_symbolic(type, \ - { BTRFS_QGROUP_RSV_DATA, "DATA" }, \ - { BTRFS_QGROUP_RSV_META, "META" }) + { BTRFS_QGROUP_RSV_DATA, "DATA" }, \ + { BTRFS_QGROUP_RSV_META_PERTRANS, "META_PERTRANS" }, \ + { BTRFS_QGROUP_RSV_META_PREALLOC, "META_PREALLOC" }) #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ From e1211d0e896b71d395fe411d0e0a76f4bc336617 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:30 +0800 Subject: [PATCH 144/164] btrfs: qgroup: Don't use root->qgroup_meta_rsv for qgroup Since qgroup has seperate metadata reservation types now, we can completely get rid of the old root->qgroup_meta_rsv, which mostly acts as current META_PERTRANS reservation type. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.c | 1 - fs/btrfs/qgroup.c | 33 ++++++++++++++++++++++++--------- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9f512be9d41..df0463e2ab7f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1264,9 +1264,6 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; - - /* For qgroup metadata space reserve */ - atomic64_t qgroup_meta_rsv; }; struct btrfs_file_private { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 38b387ae78f8..ad900f821f1e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1163,7 +1163,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); - atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8831eaa14204..a7ca464cdbf7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2499,6 +2499,15 @@ out: return ret; } +/* + * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 + * qgroup). 
+ * + * Will handle all higher level qgroup too. + * + * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. + * This special case is only used for META_PERTRANS type. + */ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type) @@ -2515,6 +2524,10 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return; + if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { + WARN(1, "%s: Invalid type to free", __func__); + return; + } spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -2525,6 +2538,13 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, if (!qgroup) goto out; + /* + * We're freeing all pertrans rsv, get current value from level 0 + * qgroup as real num_bytes to free. + */ + if (num_bytes == (u64)-1) + num_bytes = qgroup->rsv.values[type]; + ulist_reinit(fs_info->qgroup_ulist); ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); @@ -3082,24 +3102,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; - atomic64_add(num_bytes, &root->qgroup_meta_rsv); return ret; } void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 reserved; if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || !is_fstree(root->objectid)) return; - reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); - if (reserved == 0) - return; - trace_qgroup_meta_reserve(root, -(s64)reserved); - btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved, + /* TODO: Update trace point to handle such free */ + trace_qgroup_meta_reserve(root, 0); + /* Special value -1 means to free all reserved space */ + btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -3113,8 +3130,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, return; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); - atomic64_sub(num_bytes, &root->qgroup_meta_rsv); trace_qgroup_meta_reserve(root, -(s64)num_bytes); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); } From 64cfaef6362fc756972f477372997fbe117d79cb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:31 +0800 Subject: [PATCH 145/164] btrfs: qgroup: Introduce function to convert META_PREALLOC into META_PERTRANS For meta_prealloc reservation users, after btrfs_join_transaction() caller will modify tree so part (or even all) meta_prealloc reservation should be converted to meta_pertrans until transaction commit time. This patch introduces a new function, btrfs_qgroup_convert_reserved_meta() to do this for META_PREALLOC reservation user. 
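The intended calling pattern for the new helper, as described above, is roughly the following sketch (error handling omitted; the real call sites are converted in later patches):

        ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
        if (ret)
                return ret;

        trans = btrfs_join_transaction(root);
        /*
         * The trees are about to be modified with this handle, so the
         * reserved bytes must now survive until the transaction commits.
         */
        btrfs_qgroup_convert_reserved_meta(root, num_bytes);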
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/qgroup.h | 8 +++++++ 2 files changed, 64 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a7ca464cdbf7..9ce626579c81 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3134,6 +3134,62 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); } +static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, + int num_bytes) +{ + struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_qgroup *qgroup; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + if (num_bytes == 0) + return; + if (!quota_root) + return; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + ulist_reinit(fs_info->qgroup_ulist); + ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + goto out; + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = unode_aux_to_qgroup(unode); + + qgroup_rsv_release(fs_info, qg, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); + qgroup_rsv_add(fs_info, qg, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(fs_info->qgroup_ulist, + glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; + } + } +out: + spin_unlock(&fs_info->qgroup_lock); +} + +void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + !is_fstree(root->objectid)) + return; + qgroup_convert_meta(fs_info, root->objectid, num_bytes); +} + /* * Check qgroup reserved space leaking, normally at destroy inode * time diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 987a5a49deb8..e63e2d497a8e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -332,5 +332,13 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); +/* + * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. + * + * This is called when preallocated meta reservation needs to be used. + * Normally after btrfs_join_transaction() call. + */ +void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); + void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ From 43b18595d6603cb4197fb9b063915cd7802141a6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:32 +0800 Subject: [PATCH 146/164] btrfs: qgroup: Use separate meta reservation type for delalloc Before this patch, btrfs qgroup is mixing per-transcation meta rsv with preallocated meta rsv, making it quite easy to underflow qgroup meta reservation. Since we have the new qgroup meta rsv types, apply it to delalloc reservation. Now for delalloc, most of its reserved space will use META_PREALLOC qgroup rsv type. And for callers reducing outstanding extent like btrfs_finish_ordered_io(), they will convert corresponding META_PREALLOC reservation to META_PERTRANS. 
This is mainly due to the fact that current qgroup numbers will only be updated in btrfs_commit_transaction(), that's to say if we don't keep such placeholder reservation, we can exceed qgroup limitation. And for callers freeing outstanding extent in error handler, we will just free META_PREALLOC bytes. This behavior makes callers of btrfs_qgroup_release_meta() or btrfs_qgroup_convert_meta() to be aware of which type they are. So in this patch, btrfs_delalloc_release_metadata() and its callers get an extra parameter to info qgroup to do correct meta convert/release. The good news is, even we use the wrong type (convert or free), it won't cause obvious bug, as prealloc type is always in good shape, and the type only affects how per-trans meta is increased or not. So the worst case will be at most metadata limitation can be sometimes exceeded (no convert at all) or metadata limitation is reached too soon (no free at all). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 +++++--- fs/btrfs/extent-tree.c | 44 ++++++++++++++++++++----------------- fs/btrfs/file.c | 15 +++++++------ fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode-map.c | 4 ++-- fs/btrfs/inode.c | 27 ++++++++++++----------- fs/btrfs/ioctl.c | 10 +++++---- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/relocation.c | 13 ++++++----- 9 files changed, 69 insertions(+), 57 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index df0463e2ab7f..7924e50cc528 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2742,7 +2742,8 @@ int btrfs_check_data_free_space(struct inode *inode, void btrfs_free_reserved_data_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len); + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -2755,10 +2756,12 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 020c1a1a6526..6b07202385d3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5760,6 +5760,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, if (num_bytes == 0) return 0; + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + if (ret) + return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); @@ -5772,11 +5775,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, /** * btrfs_inode_rsv_release - release any excessive 
reservation. * @inode - the inode we need to release from. + * @qgroup_free - free or convert qgroup meta. + * Unlike normal operation, qgroup meta reservation needs to know if we are + * freeing qgroup reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. */ -static void btrfs_inode_rsv_release(struct btrfs_inode *inode) +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -5792,6 +5799,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode) if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, released); + else + btrfs_qgroup_convert_reserved_meta(inode->root, released); } void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, @@ -6033,7 +6044,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; @@ -6071,19 +6081,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, - nr_extents * fs_info->nodesize, true); - if (ret) - goto out_fail; - } - ret = btrfs_inode_rsv_refill(inode, flush); - if (unlikely(ret)) { - btrfs_qgroup_free_meta_prealloc(root, - nr_extents * fs_info->nodesize); + if (unlikely(ret)) goto out_fail; - } if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); @@ -6097,7 +6097,7 @@ out_fail: btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, true); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6107,12 +6107,14 @@ out_fail: * btrfs_delalloc_release_metadata - release a metadata reservation for an inode * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata * reservations, or on error for the same reason. */ -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); @@ -6125,13 +6127,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, qgroup_free); } /** * btrfs_delalloc_release_extents - release our outstanding_extents * @inode: the inode to balance the reservation for. 
* @num_bytes: the number of bytes we originally reserved with + * @qgroup_free: do we need to free qgroup meta reservation or convert them. * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -6139,7 +6142,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * temporarily tracked outstanding_extents. This _must_ be used in conjunction * with btrfs_delalloc_reserve_metadata. */ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); unsigned num_extents; @@ -6153,7 +6157,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, qgroup_free); } /** @@ -6209,9 +6213,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode, */ void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, - u64 start, u64 len) + u64 start, u64 len, bool qgroup_free) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); btrfs_free_reserved_data_space(inode, reserved, start, len); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6d878f1d1082..f247300170e5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1691,7 +1691,7 @@ again: force_page_uptodate); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, true); break; } @@ -1703,7 +1703,7 @@ again: if (extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); + reserve_bytes, true); ret = extents_locked; break; } @@ -1738,7 +1738,7 @@ again: fs_info->sb->s_blocksize_bits; if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes); + release_bytes, true); } else { u64 __pos; @@ -1747,7 +1747,7 @@ again: (dirty_pages << PAGE_SHIFT); btrfs_delalloc_release_space(inode, data_reserved, __pos, - release_bytes); + release_bytes, true); } } @@ -1760,7 +1760,8 @@ again: if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, + (ret != 0)); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -1800,11 +1801,11 @@ again: if (only_release_metadata) { btrfs_end_write_no_snapshotting(root); btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes); + release_bytes, true); } else { btrfs_delalloc_release_space(inode, data_reserved, round_down(pos, fs_info->sectorsize), - release_bytes); + release_bytes, true); } } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a9f22ac50d6a..d0dde9e6afd7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3547,7 +3547,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (ret) { if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), - inode->i_size); + inode->i_size, true); #ifdef DEBUG btrfs_err(fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 022b19336fee..9409dcc7020d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ 
-500,12 +500,12 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9a47387e53f..1f091c2358a4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1867,7 +1867,7 @@ static void btrfs_clear_bit_hook(void *private_data, */ if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, len); + btrfs_delalloc_release_metadata(inode, len, false); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) @@ -2152,7 +2152,7 @@ again: ClearPageChecked(page); set_page_dirty(page); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -4802,8 +4802,8 @@ again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, - block_start, blocksize); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); + block_start, blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); ret = -ENOMEM; goto out; } @@ -4870,8 +4870,8 @@ again: out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); + blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); unlock_page(page); put_page(page); out: @@ -8636,7 +8636,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve); + offset, dio_data.reserve, true); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -8652,8 +8652,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) false); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, - offset, count - (size_t)ret); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); + offset, count - (size_t)ret, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), count, false); } out: if (wakeup) @@ -8968,7 +8968,8 @@ again: if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE - reserved_space); + page_start, PAGE_SIZE - reserved_space, + true); } } @@ -9018,16 +9019,16 @@ again: out_unlock: if (!ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); btrfs_delalloc_release_space(inode, data_reserved, page_start, - reserved_space); + reserved_space, (ret != 0)); out_noreserve: sb_end_pagefault(inode->i_sb); 
extent_changeset_free(data_reserved); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 94bcc1bf71ca..8c3ff75cbdd4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1197,7 +1197,7 @@ again: spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, - (page_cnt - i_done) << PAGE_SHIFT); + (page_cnt - i_done) << PAGE_SHIFT, true); } @@ -1215,7 +1215,8 @@ again: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, + false); extent_changeset_free(data_reserved); return i_done; out: @@ -1225,8 +1226,9 @@ out: } btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, - page_cnt << PAGE_SHIFT); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); + page_cnt << PAGE_SHIFT, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, + true); extent_changeset_free(data_reserved); return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9be98e42cfb6..661cc3db0c7c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -610,7 +610,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->len); + btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false); tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cd2298d185dd..e61e1ee9af9a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3226,7 +3226,7 @@ static int relocate_file_extent_cluster(struct inode *inode, mask); if (!page) { btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); ret = -ENOMEM; goto out; } @@ -3245,9 +3245,9 @@ static int relocate_file_extent_cluster(struct inode *inode, unlock_page(page); put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); ret = -EIO; goto out; } @@ -3274,9 +3274,9 @@ static int relocate_file_extent_cluster(struct inode *inode, unlock_page(page); put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); + PAGE_SIZE, true); clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, @@ -3292,7 +3292,8 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, + false); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } From 4f5427ccce5d9cb8e2c8f98b49e744e523d246ec Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:33 +0800 Subject: [PATCH 147/164] btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item Quite similar for delalloc, some modification to delayed-inode and delayed-item reservation. Also needs extra parameter for release case to distinguish normal release and error release. 
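A minimal sketch of the intended reserve/release pairing (condensed for illustration, not a hunk from this patch; do_delayed_update() is a hypothetical placeholder for the delayed inode/item work):

        /*
         * Reserve qgroup metadata as META_PREALLOC up front, then either
         * convert it to META_PERTRANS on the normal path or free it on the
         * error path.
         */
        static int delayed_meta_rsv_sketch(struct btrfs_root *root, u64 num_bytes)
        {
                int ret;

                ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
                if (ret < 0)
                        return ret;

                ret = do_delayed_update();      /* hypothetical placeholder */
                if (ret < 0) {
                        /* error release: hand the bytes back to the qgroup */
                        btrfs_qgroup_free_meta_prealloc(root, num_bytes);
                        return ret;
                }

                /* normal release: the bytes now belong to the transaction */
                btrfs_qgroup_convert_reserved_meta(root, num_bytes);
                return 0;
        }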
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 46 ++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index d06bef16ebd5..86ec2edc05e8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -23,6 +23,7 @@ #include "disk-io.h" #include "transaction.h" #include "ctree.h" +#include "qgroup.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -552,11 +553,12 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; u64 num_bytes; int ret; @@ -578,15 +580,17 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info, +static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *rsv; + struct btrfs_fs_info *fs_info = root->fs_info; if (!item->bytes_reserved) return; rsv = &fs_info->delayed_block_rsv; + btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved); trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); @@ -611,6 +615,9 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + if (ret < 0) + return ret; /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we @@ -630,8 +637,10 @@ static int btrfs_delayed_inode_reserve_metadata( * EAGAIN to make us stop the transaction we have, so return * ENOSPC instead so that btrfs_dirty_inode knows what to do. 
*/ - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = -ENOSPC; + btrfs_qgroup_free_meta_prealloc(root, num_bytes); + } if (!ret) { node->bytes_reserved = num_bytes; trace_btrfs_space_reservation(fs_info, @@ -653,7 +662,8 @@ static int btrfs_delayed_inode_reserve_metadata( } static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_node *node) + struct btrfs_delayed_node *node, + bool qgroup_free) { struct btrfs_block_rsv *rsv; @@ -665,6 +675,12 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, node->inode_id, node->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(node->root, + node->bytes_reserved); + else + btrfs_qgroup_convert_reserved_meta(node->root, + node->bytes_reserved); node->bytes_reserved = 0; } @@ -766,7 +782,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, curr->data_len); slot++; - btrfs_delayed_item_release_metadata(fs_info, curr); + btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); @@ -788,7 +804,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *delayed_item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; char *ptr; int ret; @@ -806,7 +821,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, delayed_item->data_len); btrfs_mark_buffer_dirty(leaf); - btrfs_delayed_item_release_metadata(fs_info, delayed_item); + btrfs_delayed_item_release_metadata(root, delayed_item); return 0; } @@ -858,7 +873,6 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf; struct btrfs_key key; @@ -908,7 +922,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, goto out; list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(fs_info, curr); + btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } @@ -1051,7 +1065,7 @@ out: no_iref: btrfs_release_path(path); err_out: - btrfs_delayed_inode_release_metadata(fs_info, node); + btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); return ret; @@ -1441,7 +1455,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item); + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible @@ -1478,7 +1492,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, return 1; } - btrfs_delayed_item_release_metadata(fs_info, item); + btrfs_delayed_item_release_metadata(node->root, item); btrfs_release_delayed_item(item); mutex_unlock(&node->mutex); return 0; @@ -1513,7 +1527,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, item->key = item_key; - ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, item); + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); /* * we have 
reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. @@ -1878,7 +1892,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(fs_info, curr_item); + btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1886,7 +1900,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(fs_info, curr_item); + btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1896,7 +1910,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_iref(delayed_node); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - btrfs_delayed_inode_release_metadata(fs_info, delayed_node); + btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); btrfs_release_delayed_inode(delayed_node); } mutex_unlock(&delayed_node->mutex); From 8287475a20552af66b32c07704dbdbeeb898ac1f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:34 +0800 Subject: [PATCH 148/164] btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space For the quota disabled->enabled case, it's possible that at reservation time quota was not enabled, so no bytes were really reserved, while at release time quota is enabled, so we will try to release bytes we didn't really own. Such a situation can cause metadata reservation underflow for both types, though it is less likely for the per-trans type since enabling quota commits the transaction. To address this, record the qgroup meta reserved bytes in root::qgroup_meta_rsv_pertrans and ::prealloc, so that at release time we won't free any bytes we didn't reserve. For DATA, this is already handled by the io_tree, so nothing needs to be done there.
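To make the underflow concrete, here is a small sketch with made-up sizes (this is only the idea behind the per-root counters, not the code added below):

        /*
         * 1) quota disabled: reserve 16K -> the qgroup sees nothing, and the
         *    per-root counter records 0 reserved bytes
         * 2) quota is enabled
         * 3) release 16K -> without the recorded value we would subtract 16K
         *    from a qgroup reservation that never contained it (underflow);
         *    with it, the release is clamped to what was actually recorded.
         */
        static u64 clamped_release(u64 recorded_bytes, u64 num_bytes)
        {
                return min_t(u64, recorded_bytes, num_bytes);
        }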
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 ++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 66 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 68 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7924e50cc528..0eb55825862a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1264,6 +1264,11 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; + + /* For qgroup metadata reserved space */ + spinlock_t qgroup_meta_rsv_lock; + u64 qgroup_meta_rsv_pertrans; + u64 qgroup_meta_rsv_prealloc; }; struct btrfs_file_private { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ad900f821f1e..269374261e36 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1147,6 +1147,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); + spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9ce626579c81..836819c34c95 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2538,11 +2538,11 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, if (!qgroup) goto out; - /* - * We're freeing all pertrans rsv, get current value from level 0 - * qgroup as real num_bytes to free. - */ if (num_bytes == (u64)-1) + /* + * We're freeing all pertrans rsv, get reserved value from + * level 0 qgroup as real num_bytes to free. + */ num_bytes = qgroup->rsv.values[type]; ulist_reinit(fs_info->qgroup_ulist); @@ -3087,6 +3087,46 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } +static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + if (type != BTRFS_QGROUP_RSV_META_PREALLOC && + type != BTRFS_QGROUP_RSV_META_PERTRANS) + return; + if (num_bytes == 0) + return; + + spin_lock(&root->qgroup_meta_rsv_lock); + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) + root->qgroup_meta_rsv_prealloc += num_bytes; + else + root->qgroup_meta_rsv_pertrans += num_bytes; + spin_unlock(&root->qgroup_meta_rsv_lock); +} + +static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + if (type != BTRFS_QGROUP_RSV_META_PREALLOC && + type != BTRFS_QGROUP_RSV_META_PERTRANS) + return 0; + if (num_bytes == 0) + return 0; + + spin_lock(&root->qgroup_meta_rsv_lock); + if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { + num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, + num_bytes); + root->qgroup_meta_rsv_prealloc -= num_bytes; + } else { + num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, + num_bytes); + root->qgroup_meta_rsv_pertrans -= num_bytes; + } + spin_unlock(&root->qgroup_meta_rsv_lock); + return num_bytes; +} + int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { @@ -3102,6 +3142,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; + /* + * Record what we have reserved into root. + * + * To avoid quota disabled->enabled underflow. 
+ * In that case, we may try to free space we haven't reserved + * (since quota was disabled), so record what we reserved into root. + * And ensure later release won't underflow this number. + */ + add_root_meta_rsv(root, num_bytes, type); return ret; } @@ -3129,6 +3178,12 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, !is_fstree(root->objectid)) return; + /* + * reservation for META_PREALLOC can happen before quota is enabled, + * which can lead to underflow. + * Here ensure we will only free what we really have reserved. + */ + num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, -(s64)num_bytes); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); @@ -3187,6 +3242,9 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || !is_fstree(root->objectid)) return; + /* Same as btrfs_qgroup_free_meta_prealloc() */ + num_bytes = sub_root_meta_rsv(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); qgroup_convert_meta(fs_info, root->objectid, num_bytes); } From 4ee0d8832c2ecd08fd4ccbaa55484e6a500f2f34 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:35 +0800 Subject: [PATCH 149/164] btrfs: qgroup: Update trace events for metadata reservation Now trace_qgroup_meta_reserve() will have extra type parameter. And introduce two new trace events: 1) trace_qgroup_meta_free_all_pertrans() For btrfs_qgroup_free_meta_all_pertrans() 2) trace_qgroup_meta_convert() For btrfs_qgroup_convert_reserved_meta() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 7 +++-- include/trace/events/btrfs.h | 59 +++++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 836819c34c95..92e2c9f15951 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3138,7 +3138,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, (s64)num_bytes); + trace_qgroup_meta_reserve(root, type, (s64)num_bytes); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -3163,7 +3163,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) return; /* TODO: Update trace point to handle such free */ - trace_qgroup_meta_reserve(root, 0); + trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -3185,7 +3185,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, -(s64)num_bytes); + trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type); } @@ -3245,6 +3245,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); + trace_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, root->objectid, num_bytes); } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 
eee778ba1414..965c650a5273 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1663,13 +1663,14 @@ TRACE_EVENT(qgroup_update_reserve, TRACE_EVENT(qgroup_meta_reserve, - TP_PROTO(struct btrfs_root *root, s64 diff), + TP_PROTO(struct btrfs_root *root, s64 diff, int type), - TP_ARGS(root, diff), + TP_ARGS(root, diff, type), TP_STRUCT__entry_btrfs( __field( u64, refroot ) __field( s64, diff ) + __field( int, type ) ), TP_fast_assign_btrfs(root->fs_info, @@ -1677,8 +1678,58 @@ TRACE_EVENT(qgroup_meta_reserve, __entry->diff = diff; ), - TP_printk_btrfs("refroot=%llu(%s) diff=%lld", - show_root_type(__entry->refroot), __entry->diff) + TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld", + show_root_type(__entry->refroot), + show_qgroup_rsv_type(__entry->type), __entry->diff) +); + +TRACE_EVENT(qgroup_meta_convert, + + TP_PROTO(struct btrfs_root *root, s64 diff), + + TP_ARGS(root, diff), + + TP_STRUCT__entry_btrfs( + __field( u64, refroot ) + __field( s64, diff ) + __field( int, type ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->refroot = root->objectid; + __entry->diff = diff; + ), + + TP_printk_btrfs("refroot=%llu(%s) type=%s->%s diff=%lld", + show_root_type(__entry->refroot), + show_qgroup_rsv_type(BTRFS_QGROUP_RSV_META_PREALLOC), + show_qgroup_rsv_type(BTRFS_QGROUP_RSV_META_PERTRANS), + __entry->diff) +); + +TRACE_EVENT(qgroup_meta_free_all_pertrans, + + TP_PROTO(struct btrfs_root *root), + + TP_ARGS(root), + + TP_STRUCT__entry_btrfs( + __field( u64, refroot ) + __field( s64, diff ) + __field( int, type ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->refroot = root->objectid; + spin_lock(&root->qgroup_meta_rsv_lock); + __entry->diff = -(s64)root->qgroup_meta_rsv_pertrans; + spin_unlock(&root->qgroup_meta_rsv_lock); + __entry->type = BTRFS_QGROUP_RSV_META_PERTRANS; + ), + + TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld", + show_root_type(__entry->refroot), + show_qgroup_rsv_type(__entry->type), __entry->diff) ); DECLARE_EVENT_CLASS(btrfs__prelim_ref, From 0b78877a2adadc8f41f2bd959a3ffdc9c3bc0294 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Dec 2017 15:34:36 +0800 Subject: [PATCH 150/164] Revert "btrfs: qgroups: Retry after commit on getting EDQUOT" This reverts commit 48a89bc4f2ceab87bc858a8eb189636b09c846a7. The idea of committing the transaction to free some space after hitting the qgroup limit is good, but the problem is that it can easily cause deadlocks. One deadlock example is caused by trying to flush data while still holding it: Call Trace: __schedule+0x49d/0x10f0 schedule+0xc6/0x290 schedule_timeout+0x187/0x1c0 wait_for_completion+0x204/0x3a0 btrfs_wait_ordered_extents+0xa40/0xaf0 [btrfs] qgroup_reserve+0x913/0xa10 [btrfs] btrfs_qgroup_reserve_data+0x3ef/0x580 [btrfs] btrfs_check_data_free_space+0x96/0xd0 [btrfs] __btrfs_buffered_write+0x3ac/0xd40 [btrfs] btrfs_file_write_iter+0x62a/0xba0 [btrfs] __vfs_write+0x320/0x430 vfs_write+0x107/0x270 SyS_write+0xbf/0x150 do_syscall_64+0x1b0/0x3d0 entry_SYSCALL64_slow_path+0x25/0x25 Another can be caused by trying to commit a transaction while nested inside a trans handle held by ourselves: btrfs_start_transaction() |- btrfs_qgroup_reserve_meta_pertrans() |- qgroup_reserve() |- btrfs_join_transaction() |- btrfs_commit_transaction() The retry is causing more problems than expected when the limit is enabled. At least a graceful EDQUOT is way better than a deadlock.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 92e2c9f15951..c8ca5483226e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2406,7 +2406,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - int retried = 0; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2420,7 +2419,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, capable(CAP_SYS_RESOURCE)) enforce = false; -retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; if (!quota_root) @@ -2447,27 +2445,6 @@ retry: qg = unode_aux_to_qgroup(unode); if (enforce && !qgroup_check_limits(qg, num_bytes)) { - /* - * Commit the tree and retry, since we may have - * deletions which would free up space. - */ - if (!retried && qgroup_rsv_total(qg) > 0) { - struct btrfs_trans_handle *trans; - - spin_unlock(&fs_info->qgroup_lock); - ret = btrfs_start_delalloc_inodes(root, 0); - if (ret) - return ret; - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - retried++; - goto retry; - } ret = -EDQUOT; goto out; } From a1840b50238bce5d106a9aeedf47a4bcdb0689ba Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Mar 2018 19:04:50 +0200 Subject: [PATCH 151/164] btrfs: use helper to set ulist aux from a qgroup We have a nice helper to do proper casting of a qgroup to a ulist aux value. And several places that could make use of it. 
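For reference, the helper is a trivial cast wrapper that pairs with the existing unode_aux_to_qgroup(); a sketch of the two follows (the real definitions live in fs/btrfs/qgroup.c and may differ in minor details):

        static u64 qgroup_to_aux(struct btrfs_qgroup *qg)
        {
                /* store the qgroup pointer in the ulist aux field */
                return (u64)(uintptr_t)qg;
        }

        static struct btrfs_qgroup *unode_aux_to_qgroup(struct ulist_node *n)
        {
                /* recover the qgroup pointer from the aux field */
                return (struct btrfs_qgroup *)(uintptr_t)n->aux;
        }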
Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c8ca5483226e..6b715d6d3c94 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2434,7 +2434,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, */ ulist_reinit(fs_info->qgroup_ulist); ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - (uintptr_t)qgroup, GFP_ATOMIC); + qgroup_to_aux(qgroup), GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); @@ -2452,7 +2452,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) goto out; } @@ -2524,7 +2524,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, ulist_reinit(fs_info->qgroup_ulist); ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - (uintptr_t)qgroup, GFP_ATOMIC); + qgroup_to_aux(qgroup), GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); @@ -2540,7 +2540,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) goto out; } @@ -3186,7 +3186,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, goto out; ulist_reinit(fs_info->qgroup_ulist); ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - (uintptr_t)qgroup, GFP_ATOMIC); + qgroup_to_aux(qgroup), GFP_ATOMIC); if (ret < 0) goto out; ULIST_ITER_INIT(&uiter); @@ -3203,7 +3203,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) goto out; } From 4ee3fad34a9cc2cf33303dfbd0cf554248651c86 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 26 Mar 2018 23:59:00 +0100 Subject: [PATCH 152/164] Btrfs: fix fsync after hole punching when using no-holes feature When we have the no-holes mode enabled and fsync a file after punching a hole in it, we can end up not logging the whole hole range in the log tree. This happens if the file has extent items that span more than one leaf and we punch a hole that covers a range that starts in a leaf but does not go beyond the offset of the first extent in the next leaf. Example: $ mkfs.btrfs -f -O no-holes -n 65536 /dev/sdb $ mount /dev/sdb /mnt $ for ((i = 0; i <= 831; i++)); do offset=$((i * 2 * 256 * 1024)) xfs_io -f -c "pwrite -S 0xab -b 256K $offset 256K" \ /mnt/foobar >/dev/null done $ sync # We now have 2 leafs in our filesystem fs tree, the first leaf has an # item corresponding the extent at file offset 216530944 and the second # leaf has a first item corresponding to the extent at offset 217055232. # Now we punch a hole that partially covers the range of the extent at # offset 216530944 but does go beyond the offset 217055232. 
$ xfs_io -c "fpunch $((216530944 + 128 * 1024 - 4000)) 256K" /mnt/foobar $ xfs_io -c "fsync" /mnt/foobar # mount to replay the log $ mount /dev/sdb /mnt # Before this patch, only the subrange [216658016, 216662016[ (length of # 4000 bytes) was logged, leaving an incorrect file layout after log # replay. Fix this by checking if there is a hole between the last extent item that we processed and the first extent item in the next leaf, and if there is one, log an explicit hole extent item. Fixes: 16e7549f045d ("Btrfs: incompatible format change to remove hole extents") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7b8fee45b29e..552582a028a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4002,6 +4002,36 @@ fill_holes: break; *last_extent = extent_end; } + + /* + * Check if there is a hole between the last extent found in our leaf + * and the first extent in the next leaf. If there is one, we need to + * log an explicit hole so that at replay time we can punch the hole. + */ + if (ret == 0 && + key.objectid == btrfs_ino(inode) && + key.type == BTRFS_EXTENT_DATA_KEY && + i == btrfs_header_nritems(src_path->nodes[0])) { + ret = btrfs_next_leaf(inode->root, src_path); + need_find_last_extent = true; + if (ret > 0) { + ret = 0; + } else if (ret == 0) { + btrfs_item_key_to_cpu(src_path->nodes[0], &key, + src_path->slots[0]); + if (key.objectid == btrfs_ino(inode) && + key.type == BTRFS_EXTENT_DATA_KEY && + *last_extent < key.offset) { + const u64 len = key.offset - *last_extent; + + ret = btrfs_insert_file_extent(trans, log, + btrfs_ino(inode), + *last_extent, 0, + 0, len, 0, len, + 0, 0, 0); + } + } + } /* * Need to let the callers know we dropped the path so they should * re-search. From 8434ec46c6e3232cebc25a910363b29f5c617820 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 26 Mar 2018 23:59:12 +0100 Subject: [PATCH 153/164] Btrfs: fix copy_items() return value when logging an inode When logging an inode, at tree-log.c:copy_items(), if we call btrfs_next_leaf() in the loop which checks for the need to log holes, we need to make sure copy_items() returns the value 1 to its caller and not 0 (on success). This is because the path the caller passed was released and is now different from what it was before, and the caller expects a return value of 0 to mean both success and that the path has not changed, while a return value of 1 means both success and signals the caller that it cannot reuse the path and has to perform another tree search. Even though this is a case that should not be triggered under normal circumstances, or is at least very rare, its consequences can be very unpredictable (especially when replaying a log tree).
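A schematic of the caller-side handling in btrfs_log_inode() that relies on this contract (condensed and indicative only, not a verbatim hunk; the argument list and variable names are assumptions):

        ret = copy_items(trans, inode, dst_path, path, &last_extent,
                         ins_start_slot, ins_nr, inode_only, logged_isize);
        if (ret < 0) {
                err = ret;                      /* hard error */
                goto out_unlock;
        } else if (ret) {
                /* success, but @path was released: force a fresh search */
                ins_nr = 0;
                btrfs_release_path(path);
                continue;
        }
        /* ret == 0: success and @path is still positioned where it was */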
Fixes: 16e7549f045d ("Btrfs: incompatible format change to remove hole extents") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 552582a028a4..70afd1085033 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3968,6 +3968,7 @@ fill_holes: ASSERT(ret == 0); src = src_path->nodes[0]; i = 0; + need_find_last_extent = true; } btrfs_item_key_to_cpu(src, &key, i); From 3c0efdf03b2d127f0e40e30db4e7aa0429b1b79a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 27 Mar 2018 20:44:18 +0800 Subject: [PATCH 154/164] btrfs: tests/qgroup: Fix wrong tree backref level The extent tree of the test fs is like the following: BTRFS info (device (null)): leaf 16327509003777336587 total ptrs 1 free space 3919 item 0 key (4096 168 4096) itemoff 3944 itemsize 51 extent refs 1 gen 1 flags 2 tree block key (68719476736 0 0) level 1 ^^^^^^^ ref#0: tree block backref root 5 And it's using an empty tree for the fs tree, so there is no way that its level can be 1. For a REAL fs tree backref (created by mkfs) with no skinny metadata, the result should look like: item 3 key (30408704 EXTENT_ITEM 4096) itemoff 3845 itemsize 51 refs 1 gen 4 flags TREE_BLOCK tree block key (256 INODE_ITEM 0) level 0 ^^^^^^^ tree block backref root 5 Fix the level to 0, so it won't break the later tree level checker. Fixes: faa2dbf004e8 ("Btrfs: add sanity tests for new qgroup accounting code") Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tests/qgroup-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 90204b166643..160eb2fba726 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -63,7 +63,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, btrfs_set_extent_generation(leaf, item, 1); btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK); block_info = (struct btrfs_tree_block_info *)(item + 1); - btrfs_set_tree_block_level(leaf, block_info, 1); + btrfs_set_tree_block_level(leaf, block_info, 0); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); if (parent > 0) { btrfs_set_extent_inline_ref_type(leaf, iref, From 581c1760415c48cca9349b198bba52dd38750765 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 29 Mar 2018 09:08:11 +0800 Subject: [PATCH 155/164] btrfs: Validate child tree block's level and first key We have several reports about node pointers pointing to incorrect child tree blocks, which can have even the wrong owner and level but still a valid generation and checksum. Although btrfs check can handle it and print an error message like: leaf parent key incorrect 60670574592 The kernel itself does not check this type of corruption thoroughly enough. At least add such checks to read_tree_block() and btrfs_read_buffer(), which need two new parameters, @level and @first_key, to verify the child tree block. The new @level check is mandatory and all call sites are already modified to extract the expected level from their call chains. @first_key is optional, and the following call sites skip the check: 1) Root node/leaf As ROOT_ITEM doesn't contain the first key, skip the @first_key check. 2) Direct backref Only the parent bytenr and level are known and we would need to resolve the key all by ourselves, so skip the @first_key check.
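Summarized, the caller-side pattern that the hunks below repeat at every place a child block is read looks like this (condensed; 'parent' and 'slot' stand for the node and slot holding the child pointer):

        struct btrfs_key first_key;
        int parent_level = btrfs_header_level(parent);
        u64 bytenr = btrfs_node_blockptr(parent, slot);
        u64 gen = btrfs_node_ptr_generation(parent, slot);

        /* the expected level and first key come from the parent node */
        btrfs_node_key_to_cpu(parent, &first_key, slot);
        eb = read_tree_block(fs_info, bytenr, gen, parent_level - 1, &first_key);
        if (IS_ERR(eb))
                return PTR_ERR(eb);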
Another note of this verification is, it needs extra info from nodeptr or ROOT_ITEM, so it can't fit into current tree-checker framework, which is limited to node/leaf boundary. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/backref.c | 6 ++- fs/btrfs/ctree.c | 28 ++++++++++--- fs/btrfs/disk-io.c | 95 ++++++++++++++++++++++++++++++++++++------ fs/btrfs/disk-io.h | 8 ++-- fs/btrfs/extent-tree.c | 6 ++- fs/btrfs/print-tree.c | 10 +++-- fs/btrfs/qgroup.c | 7 +++- fs/btrfs/ref-verify.c | 7 +++- fs/btrfs/relocation.c | 21 ++++++++-- fs/btrfs/tree-log.c | 28 ++++++++----- 10 files changed, 170 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 6007dd6b799e..571024bc632e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -738,7 +738,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0); + eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0, + ref->level - 1, NULL); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -1288,7 +1289,8 @@ again: ref->level == 0) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref->parent, 0); + eb = read_tree_block(fs_info, ref->parent, 0, + ref->level, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1ef6b67f893a..7c8faeb868f4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1354,6 +1354,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; + int level; eb_root = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(eb_root, time_seq); @@ -1364,15 +1365,17 @@ get_old_root(struct btrfs_root *root, u64 time_seq) old_root = &tm->old_root; old_generation = tm->generation; logical = old_root->logical; + level = old_root->level; } else { logical = eb_root->start; + level = btrfs_header_level(eb_root); } tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, 0); + old = read_tree_block(fs_info, logical, 0, level, NULL); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); @@ -1592,6 +1595,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(parent); for (i = start_slot; i <= end_slot; i++) { + struct btrfs_key first_key; int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1601,6 +1605,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, progress_passed = 1; blocknr = btrfs_node_blockptr(parent, i); gen = btrfs_node_ptr_generation(parent, i); + btrfs_node_key_to_cpu(parent, &first_key, i); if (last_block == 0) last_block = blocknr; @@ -1624,7 +1629,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, uptodate = 0; if (!cur || !uptodate) { if (!cur) { - cur = read_tree_block(fs_info, blocknr, gen); + cur = read_tree_block(fs_info, blocknr, gen, + parent_level - 1, + &first_key); if (IS_ERR(cur)) { return PTR_ERR(cur); } else if (!extent_buffer_uptodate(cur)) { @@ -1632,7 +1639,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, return -EIO; } } else if (!uptodate) { - err = btrfs_read_buffer(cur, gen); + err = btrfs_read_buffer(cur, gen, + parent_level - 1,&first_key); if (err) { free_extent_buffer(cur); return err; @@ 
-1785,14 +1793,17 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent, { int level = btrfs_header_level(parent); struct extent_buffer *eb; + struct btrfs_key first_key; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0); + btrfs_node_key_to_cpu(parent, &first_key, slot); eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot), - btrfs_node_ptr_generation(parent, slot)); + btrfs_node_ptr_generation(parent, slot), + level - 1, &first_key); if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); eb = ERR_PTR(-EIO); @@ -2388,10 +2399,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, u64 gen; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; + struct btrfs_key first_key; int ret; + int parent_level; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); + parent_level = btrfs_header_level(b); + btrfs_node_key_to_cpu(b, &first_key, slot); tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { @@ -2410,7 +2425,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_set_path_blocking(p); /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen); + ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); if (!ret) { *eb_ret = tmp; return 0; @@ -2437,7 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(fs_info, blocknr, 0); + tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1, + &first_key); if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 269374261e36..a2f3a0c67a99 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -427,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return ret; } +static int verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key) +{ + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(1); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); +#endif + return -EIO; + } + + if (!first_key) + return 0; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + +#ifdef CONFIG_BTRFS_DEBUG + if (ret) { + WARN_ON(1); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)", + eb->start, first_key->objectid, first_key->type, + first_key->offset, found_key.objectid, + found_key.type, found_key.offset); + } +#endif + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. 
+ * + * @parent_transid: expected transid, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key of first slot, skip check if NULL */ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_io_tree *io_tree; int failed = 0; @@ -448,11 +494,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { - if (!verify_parent_transid(io_tree, eb, + if (verify_parent_transid(io_tree, eb, parent_transid, 0)) - break; - else ret = -EIO; + else if (verify_level_key(fs_info, eb, level, + first_key)) + ret = -EUCLEAN; + else + break; } /* @@ -460,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, * there is no reason to read the other copies, they won't be * any less wrong. */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) || + ret == -EUCLEAN) break; num_copies = btrfs_num_copies(fs_info, @@ -1049,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) buf->start, buf->start + buf->len - 1); } +/* + * Read tree block at logical address @bytenr and do variant basic but critical + * verification. + * + * @parent_transid: expected transid of this tree block, skip check if 0 + * @level: expected level, mandatory check + * @first_key: expected key in slot 0, skip check if NULL + */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 parent_transid) + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; @@ -1059,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); if (ret) { free_extent_buffer(buf); return ERR_PTR(ret); @@ -1388,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_path *path; u64 generation; int ret; + int level; path = btrfs_alloc_path(); if (!path) @@ -1410,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); + level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), - generation); + generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); goto find_fail; @@ -2261,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); + int level = btrfs_super_log_root_level(disk_super); if (fs_devices->rw_devices == 0) { btrfs_warn(fs_info, "log replay required on RO media"); @@ -2274,7 +2337,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(fs_info, bytenr, - fs_info->generation + 1); + fs_info->generation + 1, + level, NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2390,6 +2454,7 @@ int open_ctree(struct super_block *sb, int 
num_backups_tried = 0; int backup_index = 0; int clear_free_space_tree = 0; + int level; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2725,12 +2790,13 @@ int open_ctree(struct super_block *sb, } generation = btrfs_super_chunk_root_generation(disk_super); + level = btrfs_super_chunk_root_level(disk_super); __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { btrfs_err(fs_info, "failed to read chunk root"); @@ -2764,10 +2830,11 @@ int open_ctree(struct super_block *sb, retry_root_backup: generation = btrfs_super_generation(disk_super); + level = btrfs_super_root_level(disk_super); tree_root->node = read_tree_block(fs_info, btrfs_super_root(disk_super), - generation); + generation, level, NULL); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { btrfs_warn(fs_info, "failed to read tree root"); @@ -3887,12 +3954,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; - return btree_read_extent_buffer_pages(fs_info, buf, parent_transid); + return btree_read_extent_buffer_pages(fs_info, buf, parent_transid, + level, first_key); } static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 70a88d61b547..453ea9f5d4e9 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,8 +52,9 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; -struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 parent_transid); +struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, + u64 parent_transid, int level, + struct btrfs_key *first_key); void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, int mirror_num, struct extent_buffer **eb); @@ -123,7 +124,8 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root) void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, + struct btrfs_key *first_key); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6b07202385d3..72f6c03445b6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8710,6 +8710,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 parent; u32 blocksize; struct btrfs_key key; + struct btrfs_key first_key; struct extent_buffer *next; int level = wc->level; int reada = 0; @@ -8730,6 +8731,8 @@ static noinline int do_walk_down(struct 
btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + btrfs_node_key_to_cpu(path->nodes[level], &first_key, + path->slots[level]); blocksize = fs_info->nodesize; next = find_extent_buffer(fs_info, bytenr); @@ -8794,7 +8797,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, generation); + next = read_tree_block(fs_info, bytenr, generation, level - 1, + &first_key); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 569205e651c7..4a8770485f77 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -365,9 +365,13 @@ void btrfs_print_tree(struct extent_buffer *c) btrfs_node_blockptr(c, i)); } for (i = 0; i < nr; i++) { - struct extent_buffer *next = read_tree_block(fs_info, - btrfs_node_blockptr(c, i), - btrfs_node_ptr_generation(c, i)); + struct btrfs_key first_key; + struct extent_buffer *next; + + btrfs_node_key_to_cpu(c, &first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i), + level - 1, &first_key); if (IS_ERR(next)) { continue; } else if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 6b715d6d3c94..875df02ffaee 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1684,7 +1684,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, return 0; if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_buffer(root_eb, root_gen); + ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); if (ret) goto out; } @@ -1715,6 +1715,7 @@ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { + struct btrfs_key first_key; int parent_slot; u64 child_gen; u64 child_bytenr; @@ -1727,8 +1728,10 @@ walk_down: parent_slot = path->slots[level + 1]; child_bytenr = btrfs_node_blockptr(eb, parent_slot); child_gen = btrfs_node_ptr_generation(eb, parent_slot); + btrfs_node_key_to_cpu(eb, &first_key, parent_slot); - eb = read_tree_block(fs_info, child_bytenr, child_gen); + eb = read_tree_block(fs_info, child_bytenr, child_gen, + level, &first_key); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 171f3cce30e6..35fab67dcbe8 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -579,11 +579,16 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, while (level >= 0) { if (level) { + struct btrfs_key first_key; + block_bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); gen = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); - eb = read_tree_block(fs_info, block_bytenr, gen); + btrfs_node_key_to_cpu(path->nodes[level], &first_key, + path->slots[level]); + eb = read_tree_block(fs_info, block_bytenr, gen, + level - 1, &first_key); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e61e1ee9af9a..4874c09f6d3c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1839,6 +1839,8 @@ again: parent = eb; while (1) { + struct btrfs_key first_key; + level = btrfs_header_level(parent); BUG_ON(level < lowest_level); @@ -1852,6 +1854,7 @@ again: old_bytenr = btrfs_node_blockptr(parent, slot); blocksize = fs_info->nodesize; old_ptr_gen = 
btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(parent, &first_key, slot); if (level <= max_level) { eb = path->nodes[level]; @@ -1876,7 +1879,8 @@ again: break; } - eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen); + eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen, + level - 1, &first_key); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; } @@ -2036,6 +2040,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, last_snapshot = btrfs_root_last_snapshot(&root->root_item); for (i = *level; i > 0; i--) { + struct btrfs_key first_key; + eb = path->nodes[i]; nritems = btrfs_header_nritems(eb); while (path->slots[i] < nritems) { @@ -2056,7 +2062,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, } bytenr = btrfs_node_blockptr(eb, path->slots[i]); - eb = read_tree_block(fs_info, bytenr, ptr_gen); + btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); + eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, + &first_key); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2714,6 +2722,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { + struct btrfs_key first_key; + cond_resched(); upper = edge->node[UPPER]; @@ -2779,7 +2789,9 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->fs_info->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(fs_info, bytenr, generation); + btrfs_node_key_to_cpu(upper->eb, &first_key, slot); + eb = read_tree_block(fs_info, bytenr, generation, + upper->level - 1, &first_key); if (IS_ERR(eb)) { err = PTR_ERR(eb); goto next; @@ -2944,7 +2956,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; BUG_ON(block->key_ready); - eb = read_tree_block(fs_info, block->bytenr, block->key.offset); + eb = read_tree_block(fs_info, block->bytenr, block->key.offset, + block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 70afd1085033..c91babc6aa4b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -286,7 +286,7 @@ struct walk_control { * inside it */ int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen); + struct walk_control *wc, u64 gen, int level); }; /* @@ -294,7 +294,7 @@ struct walk_control { */ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen) + struct walk_control *wc, u64 gen, int level) { struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; @@ -304,7 +304,7 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. */ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen, level, NULL); if (ret) return ret; } @@ -2406,17 +2406,16 @@ out: * back refs). 
*/ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen) + struct walk_control *wc, u64 gen, int level) { int nritems; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; - int level; int i; int ret; - ret = btrfs_read_buffer(eb, gen); + ret = btrfs_read_buffer(eb, gen, level, NULL); if (ret) return ret; @@ -2533,6 +2532,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level >= BTRFS_MAX_LEVEL); while (*level > 0) { + struct btrfs_key first_key; + WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; @@ -2545,6 +2546,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); blocksize = fs_info->nodesize; parent = path->nodes[*level]; @@ -2555,7 +2557,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return PTR_ERR(next); if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen); + ret = wc->process_func(root, next, wc, ptr_gen, + *level - 1); if (ret) { free_extent_buffer(next); return ret; @@ -2563,7 +2566,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen, + *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -2593,7 +2597,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_buffer(next, ptr_gen); + ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -2643,7 +2647,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); ret = wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); + btrfs_header_generation(path->nodes[*level]), + *level); if (ret) return ret; @@ -2725,7 +2730,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, /* was the root node processed? if not, catch it here */ if (path->nodes[orig_level]) { ret = wc->process_func(log, path->nodes[orig_level], wc, - btrfs_header_generation(path->nodes[orig_level])); + btrfs_header_generation(path->nodes[orig_level]), + orig_level); if (ret) goto out; if (wc->free) { From a4666e688f6fbf532c6c16259bea775ee690e4b7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 16 Mar 2018 02:21:22 +0100 Subject: [PATCH 156/164] btrfs: use lockdep_assert_held for spinlocks Using lockdep_assert_held is preferred, replace assert_spin_locked. 
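As a rough illustration of the difference (not part of the patch; the lock, counter and function names below are invented and the snippet assumes a kernel context with lockdep available): assert_spin_locked() only checks that the lock is currently held by somebody, while lockdep_assert_held() verifies that it is the current task that holds it, and compiles away entirely when CONFIG_LOCKDEP is off.

#include <linux/spinlock.h>
#include <linux/lockdep.h>

static DEFINE_SPINLOCK(demo_lock);
static int demo_counter;

/* Must be called with demo_lock held. */
static void demo_counter_inc(void)
{
	/*
	 * With lockdep enabled this checks that the current task holds
	 * demo_lock; the old assert_spin_locked() only checked that the
	 * lock was locked by someone.
	 */
	lockdep_assert_held(&demo_lock);
	demo_counter++;
}

static void demo_update(void)
{
	spin_lock(&demo_lock);
	demo_counter_inc();
	spin_unlock(&demo_lock);
}

The same reasoning applies to the mutex conversion in the next patch, where ASSERT(mutex_is_locked(...)) and WARN_ON(!mutex_is_locked(...)) are replaced by the identical lockdep_assert_held() call.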
Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 6 +++--- fs/btrfs/qgroup.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 03bdf355107a..2677257c149d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -216,7 +216,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - assert_spin_locked(&delayed_refs->lock); + lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return 0; @@ -239,7 +239,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - assert_spin_locked(&head->lock); + lockdep_assert_held(&head->lock); rb_erase(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) @@ -307,7 +307,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; u64 seq = 0; - assert_spin_locked(&head->lock); + lockdep_assert_held(&head->lock); if (RB_EMPTY_ROOT(&head->ref_tree)) return; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 875df02ffaee..f583f13ff26e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1484,7 +1484,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *entry; u64 bytenr = record->bytenr; - assert_spin_locked(&delayed_refs->lock); + lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); while (*p) { From a32bf9a30201f13097428a3327c7c67f678c760c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 16 Mar 2018 02:21:22 +0100 Subject: [PATCH 157/164] btrfs: use lockdep_assert_held for mutexes Using lockdep_assert_held is preferred, replace mutex_is_locked. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/volumes.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72f6c03445b6..f30548d7e0d2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4545,7 +4545,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, * Needed because we can end up allocating a system chunk and for an * atomic and race free space reservation in the chunk block reserve. 
*/ - ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); + lockdep_assert_held(&fs_info->chunk_mutex); info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4ab4a68dbc06..fbc0c0e264af 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -371,7 +371,7 @@ static struct full_stripe_lock *insert_full_stripe_lock( struct full_stripe_lock *entry; struct full_stripe_lock *ret; - WARN_ON(!mutex_is_locked(&locks_root->lock)); + lockdep_assert_held(&locks_root->lock); p = &locks_root->root.rb_node; while (*p) { @@ -413,7 +413,7 @@ static struct full_stripe_lock *search_full_stripe_lock( struct rb_node *node; struct full_stripe_lock *entry; - WARN_ON(!mutex_is_locked(&locks_root->lock)); + lockdep_assert_held(&locks_root->lock); node = locks_root->root.rb_node; while (node) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4fc6acf65220..0c331d51385e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2067,7 +2067,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices; - WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + lockdep_assert_held(&fs_info->fs_devices->device_list_mutex); /* * in case of fs with no seed, srcdev->fs_devices will point @@ -2257,7 +2257,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 super_flags; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + lockdep_assert_held(&uuid_mutex); if (!fs_devices->seeding) return -EINVAL; @@ -2990,7 +2990,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. */ - ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex)); + lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); ret = btrfs_can_relocate(fs_info, chunk_offset); if (ret) @@ -5100,7 +5100,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { u64 chunk_offset; - ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); + lockdep_assert_held(&fs_info->chunk_mutex); chunk_offset = find_next_chunk(fs_info); return __btrfs_alloc_chunk(trans, chunk_offset, type); } @@ -6658,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices; int ret; - BUG_ON(!mutex_is_locked(&uuid_mutex)); + lockdep_assert_held(&uuid_mutex); ASSERT(fsid); fs_devices = fs_info->fs_devices->seed; From d1980131ca7f0776542f776ceb777cd01eb983e2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 16 Mar 2018 02:39:40 +0100 Subject: [PATCH 158/164] btrfs: update barrier in should_cow_block Once there was a simple int force_cow that was used with the plain barriers, and then converted to a bit, so we should use the appropriate barrier helper. Other variables in the complex if condition do not depend on a barrier, so we should be fine in case the atomic barrier becomes a no-op. 
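As a condensed before/after sketch of the conversion described here (illustrative only, not the btrfs code; the flag, variable and function names are invented): the old plain int flag was read behind smp_rmb(), while the bit-based flag is tested with test_bit() behind smp_mb__before_atomic(), the helper the patch considers appropriate now that the flag is an atomic bit.

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/types.h>

#define DEMO_FORCE_COW	0	/* bit number, standing in for BTRFS_ROOT_FORCE_COW */

static unsigned long demo_root_state;
static int demo_force_cow;	/* the old-style plain int flag */

/* Old shape of the check, with the plain barrier. */
static bool demo_should_cow_old(void)
{
	/* ensure we can see the force_cow */
	smp_rmb();
	return demo_force_cow != 0;
}

/* New shape, mirroring the patched should_cow_block(). */
static bool demo_should_cow(void)
{
	/* Ensure we can see the DEMO_FORCE_COW bit */
	smp_mb__before_atomic();
	return test_bit(DEMO_FORCE_COW, &demo_root_state);
}

The writer side, which sets the bit with set_bit(), is left out of the sketch since this patch only touches the reader.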
Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7c8faeb868f4..a2c9d21176e2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1441,8 +1441,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, if (btrfs_is_testing(root->fs_info)) return 0; - /* ensure we can see the force_cow */ - smp_rmb(); + /* Ensure we can see the FORCE_COW bit */ + smp_mb__before_atomic(); /* * We do not need to cow a block if From 88c14590cdd6f3cafc7ea7487d5f4532db8c551e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 16 Mar 2018 03:27:02 +0100 Subject: [PATCH 159/164] btrfs: use RCU in btrfs_show_devname for device list traversal The show_devname callback is used to print device name in /proc/self/mounts, we need to traverse the device list consistently and read the name that's copied to a seq buffer so we don't need further locking. If the first device is being deleted at the same time, the RCU will allow us to read the device name, though it will become stale right after the RCU protection ends. This is unavoidable and the user can expect that the device will disappear from the filesystem's list at some point. The device_list_mutex was pretty heavy as it is used eg. for writing superblock and a few other IO related contexts. This can stall any application that reads the proc file for no reason. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/super.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d41d5960ef4a..170baef49fae 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2304,11 +2304,18 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) struct list_head *head; struct rcu_string *name; - mutex_lock(&fs_info->fs_devices->device_list_mutex); + /* + * Lightweight locking of the devices. We should not need + * device_list_mutex here as we only read the device data and the list + * is protected by RCU. Even if a device is deleted during the list + * traversals, we'll get valid data, the freeing callback will wait at + * least until until the rcu_read_unlock. + */ + rcu_read_lock(); cur_devices = fs_info->fs_devices; while (cur_devices) { head = &cur_devices->devices; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry_rcu(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->name) @@ -2320,14 +2327,12 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) } if (first_dev) { - rcu_read_lock(); name = rcu_dereference(first_dev->name); seq_escape(m, name->str, " \t\n\\"); - rcu_read_unlock(); } else { WARN_ON(1); } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + rcu_read_unlock(); return 0; } From e7ab0af6c30f763e41a1206318d4756d270e93a0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Mar 2018 20:08:28 +0200 Subject: [PATCH 160/164] btrfs: remove stale comments about fs_mutex The fs_mutex has been killed in 2008, a213501153fd66e2 ("Btrfs: Replace the big fs_mutex with a collection of other locks"), still remembered in some comments. We don't have any extra needs for locking in the ACL handlers. 
Signed-off-by: David Sterba --- fs/btrfs/acl.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f8a1bdf06b2a..0066d95b133f 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -65,9 +65,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -/* - * Needs to be called with fs_mutex held - */ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { @@ -127,11 +124,6 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; } -/* - * btrfs_init_acl is already generally called under fs_mutex, so the locking - * stuff has been fixed to work with that. If the locking stuff changes, we - * need to re-evaluate the acl locking stuff. - */ int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { From 7e79cb86be28ed8073870c22f479b3b1293ecb85 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 24 Mar 2018 02:11:38 +0100 Subject: [PATCH 161/164] btrfs: split dev-replace locking helpers for read and write The current calls are unclear in what way btrfs_dev_replace_lock takes the locks, so drop the argument, split the helpers and use similar naming as for read and write locks. Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 98 +++++++++++++++++++++--------------------- fs/btrfs/dev-replace.h | 6 ++- fs/btrfs/reada.c | 10 ++--- fs/btrfs/scrub.c | 14 +++--- fs/btrfs/volumes.c | 18 ++++---- 5 files changed, 74 insertions(+), 72 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e279f04b3388..0d203633bb96 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -205,13 +205,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); return 0; } - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -269,7 +269,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -292,7 +292,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_mark_buffer_dirty(eb); @@ -357,7 +357,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -395,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - btrfs_dev_replace_unlock(dev_replace, 1); + 
btrfs_dev_replace_write_unlock(dev_replace); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -407,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); goto leave; } @@ -431,7 +431,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } @@ -498,18 +498,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); /* * flush all outstanding I/O and inode extent mappings before the @@ -534,7 +534,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); dev_replace->replace_state = scrub_ret ? 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -554,7 +554,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); @@ -591,7 +591,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_rm_dev_replace_blocked(fs_info); @@ -684,7 +684,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -696,7 +696,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, args->status.num_uncorrectable_read_errors = atomic64_read(&dev_replace->num_uncorrectable_read_errors); args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); } int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) @@ -713,13 +713,13 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); goto leave; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: @@ -733,7 +733,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); btrfs_scrub_cancel(fs_info); trans = btrfs_start_transaction(root, 0); @@ -762,7 +762,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_lock(dev_replace, 1); + btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -778,7 +778,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -788,12 +788,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_lock(dev_replace, 1); 
+ btrfs_dev_replace_write_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -807,10 +807,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); return 0; } - btrfs_dev_replace_unlock(dev_replace, 1); + btrfs_dev_replace_write_unlock(dev_replace); WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); @@ -879,35 +879,35 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw) +void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) +{ + read_lock(&dev_replace->lock); + atomic_inc(&dev_replace->read_locks); +} + +void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) +{ + ASSERT(atomic_read(&dev_replace->read_locks) > 0); + atomic_dec(&dev_replace->read_locks); + read_unlock(&dev_replace->lock); +} + +void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) { - if (rw == 1) { - /* write */ again: - wait_event(dev_replace->read_lock_wq, - atomic_read(&dev_replace->blocking_readers) == 0); - write_lock(&dev_replace->lock); - if (atomic_read(&dev_replace->blocking_readers)) { - write_unlock(&dev_replace->lock); - goto again; - } - } else { - read_lock(&dev_replace->lock); - atomic_inc(&dev_replace->read_locks); + wait_event(dev_replace->read_lock_wq, + atomic_read(&dev_replace->blocking_readers) == 0); + write_lock(&dev_replace->lock); + if (atomic_read(&dev_replace->blocking_readers)) { + write_unlock(&dev_replace->lock); + goto again; } } -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw) +void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) { - if (rw == 1) { - /* write */ - ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); - write_unlock(&dev_replace->lock); - } else { - ASSERT(atomic_read(&dev_replace->read_locks) > 0); - atomic_dec(&dev_replace->read_locks); - read_unlock(&dev_replace->lock); - } + ASSERT(atomic_read(&dev_replace->blocking_readers) == 0); + write_unlock(&dev_replace->lock); } /* inc blocking cnt and release read lock */ diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 389de365b0db..8566a02ef222 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -36,8 +36,10 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw); -void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw); +void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); +void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); +void 
btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); void btrfs_dev_replace_clear_lock_blocking( struct btrfs_dev_replace *dev_replace); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index ab852b8e3e37..a52dd12af648 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -395,20 +395,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); goto error; } @@ -451,13 +451,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); goto error; } have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); if (!have_zone) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fbc0c0e264af..1a2066ac6fe7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3935,11 +3935,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } - btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_lock(&fs_info->dev_replace); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_unlock(&fs_info->dev_replace); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3975,10 +3975,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_lock(&fs_info->dev_replace); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + btrfs_dev_replace_write_unlock(&fs_info->dev_replace); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -4194,16 +4194,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EIO; } - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); return -EINPROGRESS; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); ret = scrub_workers_get(fs_info, 
is_dev_replace); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0c331d51385e..93f8f17cacca 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1936,12 +1936,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_lock(&uuid_mutex); num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) @@ -3910,12 +3910,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { BUG_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); @@ -5241,11 +5241,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_lock(&fs_info->dev_replace); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + btrfs_dev_replace_read_unlock(&fs_info->dev_replace); return ret; } @@ -5823,10 +5823,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (!bbio_ret) goto out; - btrfs_dev_replace_lock(dev_replace, 0); + btrfs_dev_replace_read_lock(dev_replace); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); if (!dev_replace_is_ongoing) - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); else btrfs_dev_replace_set_lock_blocking(dev_replace); @@ -6024,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, out: if (dev_replace_is_ongoing) { btrfs_dev_replace_clear_lock_blocking(dev_replace); - btrfs_dev_replace_unlock(dev_replace, 0); + btrfs_dev_replace_read_unlock(dev_replace); } free_extent_map(em); return ret; From 38e82de8ccd1878fb5f8371b8a6f455a6bbd0325 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 26 Mar 2018 18:29:41 +0200 Subject: [PATCH 162/164] btrfs: user proper type for btrfs_mask_flags flags All users pass a local unsigned int and not the __uXX types that are supposed to be used for userspace interfaces. Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8c3ff75cbdd4..ac85e07f567b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -106,7 +106,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. 
*/ -static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags) { if (S_ISDIR(mode)) return flags; From f50f43539070c6c876bf5435fb23f898f9d81e72 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 29 Mar 2018 06:11:45 +0800 Subject: [PATCH 163/164] Btrfs: print error messages when failing to read trees When mount fails to read trees like fs tree, checksum tree, extent tree, etc, there is not enough information about where went wrong. With this, messages like "BTRFS warning (device sdf): failed to read root (objectid=7): -5" would help us a bit. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a2f3a0c67a99..07b5e6f7df67 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2383,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.offset = 0; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; btrfs_init_devices_late(fs_info); location.objectid = BTRFS_CSUM_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->csum_root = root; @@ -2416,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (IS_ERR(root)) { ret = PTR_ERR(root); if (ret != -ENOENT) - return ret; + goto out; } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2425,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) - return PTR_ERR(root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->free_space_root = root; } return 0; +out: + btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", + location.objectid, ret); + return ret; } int open_ctree(struct super_block *sb, @@ -3004,6 +3016,7 @@ retry_root_backup: fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); + btrfs_warn(fs_info, "failed to read fs tree: %d", err); goto fail_qgroup; } From 57599c7e7722daf5f8c2dba4b0e4628f5c500771 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 1 Mar 2018 17:56:34 +0100 Subject: [PATCH 164/164] btrfs: lift errors from add_extent_changeset to the callers The missing error handling in add_extent_changeset was hidden, so make it at least visible in the callers. 
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f27bad003f8e..47a8fe9d22e8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -119,23 +119,22 @@ struct extent_page_data { unsigned int sync_io:1; }; -static void add_extent_changeset(struct extent_state *state, unsigned bits, +static int add_extent_changeset(struct extent_state *state, unsigned bits, struct extent_changeset *changeset, int set) { int ret; if (!changeset) - return; + return 0; if (set && (state->state & bits) == bits) - return; + return 0; if (!set && (state->state & bits) == 0) - return; + return 0; changeset->bytes_changed += state->end - state->start + 1; ret = ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); - /* ENOMEM */ - BUG_ON(ret < 0); + return ret; } static void flush_write_bio(struct extent_page_data *epd); @@ -527,6 +526,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, { struct extent_state *next; unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; + int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -534,7 +534,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - add_extent_changeset(state, bits_to_clear, changeset, 0); + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); + BUG_ON(ret < 0); state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); @@ -805,13 +806,15 @@ static void set_state_bits(struct extent_io_tree *tree, unsigned *bits, struct extent_changeset *changeset) { unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; + int ret; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - add_extent_changeset(state, bits_to_set, changeset, 1); + ret = add_extent_changeset(state, bits_to_set, changeset, 1); + BUG_ON(ret < 0); state->state |= bits_to_set; }
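The change above makes the ulist_add() failure visible to the callers of add_extent_changeset(); the two call sites still BUG_ON() a negative return for now, so behaviour is unchanged, but the failure is reported where it happens instead of being hidden inside the helper. A standalone userspace illustration of the same error-lifting pattern (hypothetical names, plain C rather than the kernel code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct changeset {
	size_t nr;
	size_t cap;
	unsigned long *starts;
};

/*
 * Before the rework the helper dealt with (or ignored) the allocation
 * failure internally; now it returns -ENOMEM and each caller decides
 * what to do about it.
 */
static int record_change(struct changeset *cs, unsigned long start)
{
	if (cs->nr == cs->cap) {
		size_t new_cap = cs->cap ? cs->cap * 2 : 8;
		unsigned long *tmp = realloc(cs->starts, new_cap * sizeof(*tmp));

		if (!tmp)
			return -ENOMEM;
		cs->starts = tmp;
		cs->cap = new_cap;
	}
	cs->starts[cs->nr++] = start;
	return 0;
}

int main(void)
{
	struct changeset cs = { 0 };
	int ret = record_change(&cs, 4096);

	if (ret < 0)
		fprintf(stderr, "record_change failed: %d\n", ret);
	free(cs.starts);
	return ret ? EXIT_FAILURE : EXIT_SUCCESS;
}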